This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] nvptx per-warp compiler-defined stacks (-msoft-stack)


This patch implements per-warp compiler-defined stacks under the -msoft-stack
option, and implements alloca on top of that.  In a few obvious places,
changes from the -muniform-simt patch are also present in the hunks.

Previously posted here:

[PATCH] nvptx: implement automatic storage in custom stacks
https://gcc.gnu.org/ml/gcc-patches/2015-11/msg01519.html

[gomp-nvptx] nvptx backend: implement alloca with -msoft-stack
https://gcc.gnu.org/ml/gcc-patches/2015-12/msg01397.html

[gomp-nvptx 7/7] nvptx backend: define STACK_SIZE_MODE
https://gcc.gnu.org/ml/gcc-patches/2016-03/msg01108.html

2016-03-15  Alexander Monakov  <amonakov@ispras.ru>

	* config/nvptx/nvptx.h (STACK_SIZE_MODE): Define.

2015-12-14  Alexander Monakov  <amonakov@ispras.ru>

	* config/nvptx/nvptx.c (nvptx_declare_function_name): Emit %outargs
	using .local %outargs_ar only if not TARGET_SOFT_STACK.  Emit %outargs
	under TARGET_SOFT_STACK by offsetting from %frame.
	(nvptx_get_drap_rtx): Return %argp as the DRAP if needed.
	* config/nvptx/nvptx.md (nvptx_register_operand): Allow %outargs under
	TARGET_SOFT_STACK.
	(nvptx_nonimmediate_operand): Ditto.
	(allocate_stack): Implement for TARGET_SOFT_STACK.  Remove unused code.
	(allocate_stack_<mode>): Remove unused pattern.
	(set_softstack_insn): New pattern.
	(restore_stack_block): Handle for TARGET_SOFT_STACK.

2015-12-09  Alexander Monakov  <amonakov@ispras.ru>

	* config/nvptx/nvptx.c (need_softstack_decl): New variable.
	(nvptx_declare_function_name): Handle TARGET_SOFT_STACK.
	(nvptx_output_return): Emit stack restore if needed.
	(nvptx_file_end): Handle need_softstack_decl.
	* config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Define
	__nvptx_softstack__ when -msoft-stack is active.
	(struct machine_function): New bool field using_softstack.
	* config/nvptx/nvptx.opt (msoft-stack): New option.
	* doc/invoke.texi (msoft-stack): Document.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 2d4dad1..e9e4d06 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -139,6 +129,12 @@ static GTY(()) rtx worker_red_sym;
 /* Global lock variable, needed for 128bit worker & gang reductions.  */
 static GTY(()) tree global_lock_var;
 
+/* True if any function references __nvptx_stacks.  */
+static bool need_softstack_decl;
+
+/* True if any function references __nvptx_uni.  */
+static bool need_unisimt_decl;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -992,16 +1086,55 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
 
   fprintf (file, "%s", s.str().c_str());
 
-  /* Declare a local var for outgoing varargs.  */
-  if (cfun->machine->has_varadic)
-    init_frame (file, STACK_POINTER_REGNUM,
-		UNITS_PER_WORD, crtl->outgoing_args_size);
-
-  /* Declare a local variable for the frame.  */
   HOST_WIDE_INT sz = get_frame_size ();
-  if (sz || cfun->machine->has_chain)
-    init_frame (file, FRAME_POINTER_REGNUM,
-		crtl->stack_alignment_needed / BITS_PER_UNIT, sz);
+  bool need_frameptr = sz || cfun->machine->has_chain;
+  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
+  if (!TARGET_SOFT_STACK)
+    {
+      /* Declare a local var for outgoing varargs.  */
+      if (cfun->machine->has_varadic)
+	init_frame (file, STACK_POINTER_REGNUM,
+		    UNITS_PER_WORD, crtl->outgoing_args_size);
+
+      /* Declare a local variable for the frame.  */
+      if (need_frameptr)
+	init_frame (file, FRAME_POINTER_REGNUM, alignment, sz);
+    }
+  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
+    {
+      /* Maintain 64-bit stack alignment.  */
+      int keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+      sz = ROUND_UP (sz, keep_align);
+      int bits = POINTER_SIZE;
+      fprintf (file, "\t.reg.u%d %%frame;\n", bits);
+      fprintf (file, "\t.reg.u32 %%fstmp0;\n");
+      fprintf (file, "\t.reg.u%d %%fstmp1;\n", bits);
+      fprintf (file, "\t.reg.u%d %%fstmp2;\n", bits);
+      fprintf (file, "\tmov.u32 %%fstmp0, %%tid.y;\n");
+      fprintf (file, "\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
+	       bits == 64 ? ".wide" : ".lo", bits / 8);
+      fprintf (file, "\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
+      /* fstmp2 = &__nvptx_stacks[tid.y];  */
+      fprintf (file, "\tadd.u%d %%fstmp2, %%fstmp2, %%fstmp1;\n", bits);
+      fprintf (file, "\tld.shared.u%d %%fstmp1, [%%fstmp2];\n", bits);
+      fprintf (file, "\tsub.u%d %%frame, %%fstmp1, "
+	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, sz);
+      if (alignment > keep_align)
+	fprintf (file, "\tand.b%d %%frame, %%frame, %d;\n",
+		 bits, -alignment);
+      fprintf (file, "\t.reg.u%d %%stack;\n", bits);
+      sz = crtl->outgoing_args_size;
+      gcc_assert (sz % keep_align == 0);
+      fprintf (file, "\tsub.u%d %%stack, %%frame, "
+	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, sz);
+      /* crtl->is_leaf is not initialized because RA is not run.  */
+      if (!leaf_function_p ())
+	{
+	  fprintf (file, "\tst.shared.u%d [%%fstmp2], %%stack;\n", bits);
+	  cfun->machine->using_softstack = true;
+	}
+      need_softstack_decl = true;
+    }
 
   /* Declare the pseudos we have as ptx registers.  */
   int maxregs = max_reg_num ();
@@ -1037,6 +1172,10 @@ nvptx_output_return (void)
 {
   machine_mode mode = (machine_mode)cfun->machine->return_mode;
 
+  if (cfun->machine->using_softstack)
+    fprintf (asm_out_file, "\tst.shared.u%d [%%fstmp2], %%fstmp1;\n",
+	     POINTER_SIZE);
+
   if (mode != VOIDmode)
     fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
 	     nvptx_ptx_type_from_mode (mode, false),
@@ -1068,6 +1207,8 @@ nvptx_function_ok_for_sibcall (tree, tree)
 static rtx
 nvptx_get_drap_rtx (void)
 {
+  if (TARGET_SOFT_STACK && stack_realign_drap)
+    return arg_pointer_rtx;
   return NULL_RTX;
 }
 
@@ -3939,6 +4183,18 @@ nvptx_file_end (void)
   if (worker_red_size)
     write_worker_buffer (asm_out_file, worker_red_sym,
 			 worker_red_align, worker_red_size);
+
+  if (need_softstack_decl)
+    {
+      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
+      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
+	       POINTER_SIZE);
+    }
+  if (need_unisimt_decl)
+    {
+      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
+      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
+    }
 }
 
 /* Expander for the shuffle builtins.  */
diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h
index 381269e..6da4d06 100644
--- a/gcc/config/nvptx/nvptx.h
+++ b/gcc/config/nvptx/nvptx.h
@@ -31,6 +31,10 @@
       builtin_assert ("machine=nvptx");		\
       builtin_assert ("cpu=nvptx");		\
       builtin_define ("__nvptx__");		\
+      if (TARGET_SOFT_STACK)			\
+        builtin_define ("__nvptx_softstack__");	\
+      if (TARGET_UNIFORM_SIMT)			\
+        builtin_define ("__nvptx_unisimt__");	\
     } while (0)
 
 /* Avoid the default in ../../gcc.c, which adds "-pthread", which is not
@@ -79,6 +83,7 @@
 
 #define POINTER_SIZE (TARGET_ABI64 ? 64 : 32)
 #define Pmode (TARGET_ABI64 ? DImode : SImode)
+#define STACK_SIZE_MODE Pmode
 
 /* Registers.  Since ptx is a virtual target, we just define a few
    hard registers for special purposes and leave pseudos unallocated.
@@ -200,10 +205,13 @@ struct GTY(()) machine_function
   bool is_varadic;  /* This call is varadic  */
   bool has_varadic;  /* Current function has a varadic call.  */
   bool has_chain; /* Current function has outgoing static chain.  */
+  bool using_softstack; /* Need to restore __nvptx_stacks[tid.y].  */
   int num_args;	/* Number of args of current call.  */
   int return_mode; /* Return mode of current fn.
 		      (machine_mode not defined yet.) */
   rtx axis_predicate[2]; /* Neutering predicates.  */
+  rtx unisimt_master; /* Master lane index for "uniform simt" mode.  */
+  rtx unisimt_predicate; /* Predicate register for "uniform simt".  */
 };
 #endif
 
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 33a4862..e5650b6 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -961,31 +986,41 @@ (define_expand "allocate_stack"
    (match_operand 1 "nvptx_register_operand")]
   ""
 {
+  if (TARGET_SOFT_STACK)
+    {
+      emit_move_insn (stack_pointer_rtx,
+		      gen_rtx_MINUS (Pmode, stack_pointer_rtx, operands[1]));
+      emit_insn (gen_set_softstack_insn (stack_pointer_rtx));
+      emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
+      DONE;
+    }
   /* The ptx documentation specifies an alloca intrinsic (for 32 bit
      only)  but notes it is not implemented.  The assembler emits a
      confused error message.  Issue a blunt one now instead.  */
   sorry ("target cannot support alloca.");
   emit_insn (gen_nop ());
   DONE;
-  if (TARGET_ABI64)
-    emit_insn (gen_allocate_stack_di (operands[0], operands[1]));
-  else
-    emit_insn (gen_allocate_stack_si (operands[0], operands[1]));
-  DONE;
 })
 
-(define_insn "allocate_stack_<mode>"
-  [(set (match_operand:P 0 "nvptx_register_operand" "=R")
-        (unspec:P [(match_operand:P 1 "nvptx_register_operand" "R")]
-                   UNSPEC_ALLOCA))]
-  ""
-  "%.\\tcall (%0), %%alloca, (%1);")
+(define_insn "set_softstack_insn"
+  [(unspec [(match_operand 0 "nvptx_register_operand" "R")] UNSPEC_ALLOCA)]
+  "TARGET_SOFT_STACK"
+{
+  return (cfun->machine->using_softstack
+	  ? "%.\\tst.shared%t0\\t[%%fstmp2], %0;"
+	  : "");
+})
 
 (define_expand "restore_stack_block"
   [(match_operand 0 "register_operand" "")
    (match_operand 1 "register_operand" "")]
   ""
 {
+  if (TARGET_SOFT_STACK)
+    {
+      emit_move_insn (operands[0], operands[1]);
+      emit_insn (gen_set_softstack_insn (operands[0]));
+    }
   DONE;
 })
 



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]