This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] nvptx: implement automatic storage in custom stacks


Hello,

I'm proposing the following patch as a step towards resolving the issue that
stack storage (.local memory) in PTX is inaccessible to threads other than
the one using that stack.  The idea is to have preallocated stacks, and have
the __nvptx_stacks[] array in shared memory hold the current stack pointers.
Each thread maintains __nvptx_stacks[tid.y] as its stack pointer, thus for
OpenMP the intent is to preallocate on a per-warp basis (not per-thread).
For OpenMP SIMD regions we'll have to ensure that conflicting accesses are not
introduced.

I've exposed a new command-line option -msoft-stack to ease testing, but for
OpenMP we'll have to automatically flip it based on function attributes.
Right now it's not easy because OpenMP and OpenACC both use "omp declare
target".  Jakub, I seem to recall a discussion about OpenACC changing to use a
separate attribute, but I cannot find it now.  Any advice here?

This approach also makes it possible to implement alloca.  However, to drop
the alloca-avoiding changes in libgomp we'd have to selectively enable
-msoft-stack there, only for functions that OpenACC wouldn't use.

I've regression-tested it with make -k check-c.  These are the new failures,
all mysterious:

+FAIL: gcc.c-torture/execute/20090113-2.c   -O[123s]  execution test
Execution failure with invalid memory access.

+FAIL: gcc.c-torture/execute/20090113-3.c   -O[123s]  execution test
Times out (looping infinitely).

I had difficulty investigating the above two because cuda-gdb 7.0 does not
show disassembly for the misbehaving function.

+FAIL: gcc.c-torture/execute/loop-15.c   -O2  execution test
Rather surprising and unclear failure due to branch stack overflow.

There are also tests that now pass:
+PASS: gcc.c-torture/execute/20020529-1.c   -O0  execution test
Used to fail with invalid memory access.

+PASS: gcc.dg/sibcall-9.c execution test
(not meaningful on NVPTX)

+PASS: gcc.dg/torture/pr54261-1.c   -O[0123s]  execution test
Atomic modification to stack variables now works.

gcc/
	* config/nvptx/nvptx.c (need_softstack_decl): Declare.
	(nvptx_declare_function_name): Handle TARGET_SOFT_STACK.
	(nvptx_output_return): Restore stack pointer if needed.
	(nvptx_file_end): Emit declaration of __nvptx_stacks.
	* config/nvptx/nvptx.opt (msoft-stack): New option.
	* doc/invoke.texi (-msoft-stack): Document.

libgcc/
	* config/nvptx/crt0.s (__nvptx_stacks): Define.
	(%__softstack): Define 128 KiB stack for -msoft-stack.
	(__main): Setup __nvptx_stacks.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 0204ad3..df915b9 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -114,6 +114,9 @@ static unsigned worker_red_align;
 #define worker_red_name "__worker_red"
 static GTY(()) rtx worker_red_sym;
 
+/* True if any function references __nvptx_stacks.  */
+static bool need_softstack_decl;
+
 /* Allocate a new, cleared machine_function structure.  */
 
 static struct machine_function *
@@ -689,15 +692,46 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
 
   /* Declare a local variable for the frame.  */
   sz = get_frame_size ();
-  if (sz > 0 || cfun->machine->has_call_with_sc)
+  if (sz == 0 && cfun->machine->has_call_with_sc)
+    sz = 1;
+  if (sz > 0)
     {
       int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
 
-      fprintf (file, "\t.reg.u%d %%frame;\n"
-	       "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
-	       BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
-      fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
-	       BITS_PER_WORD);
+      fprintf (file, "\t.reg.u%d %%frame;\n", BITS_PER_WORD);
+      if (TARGET_SOFT_STACK)
+	{
+	  /* Maintain 64-bit stack alignment.  */
+	  int keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
+	  sz = (sz + keep_align - 1) & ~(keep_align - 1);
+	  int bits = BITS_PER_WORD;
+	  fprintf (file, "\t.reg.u32 %%fstmp0;\n");
+	  fprintf (file, "\t.reg.u%d %%fstmp1;\n", bits);
+	  fprintf (file, "\t.reg.u%d %%fstmp2;\n", bits);
+	  fprintf (file, "\tmov.u32 %%fstmp0, %%tid.y;\n");
+	  fprintf (file, "\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
+	           bits == 64 ? ".wide" : "", bits);
+	  fprintf (file, "\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
+	  /* fstmp2 = &__nvptx_stacks[tid.y];  */
+	  fprintf (file, "\tadd.u%d %%fstmp2, %%fstmp2, %%fstmp1;\n", bits);
+	  fprintf (file, "\tld.shared.u%d %%fstmp1, [%%fstmp2];\n", bits);
+	  fprintf (file, "\tsub.u%d %%frame, %%fstmp1, "
+	           HOST_WIDE_INT_PRINT_DEC ";\n", bits, sz);
+	  if (alignment > keep_align)
+	    fprintf (file, "\tand.b%d %%frame, %%frame, %d;\n",
+		     bits, -alignment);
+	  if (!crtl->is_leaf)
+	    fprintf (file, "\tst.shared.u%d [%%fstmp2], %%frame;\n", bits);
+	  need_softstack_decl = true;
+	}
+      else
+	{
+	  fprintf (file, "\t.local.align %d "
+		   ".b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
+		   alignment, sz);
+	  fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
+		   BITS_PER_WORD);
+	}
     }
 
   if (cfun->machine->has_call_with_varargs)
@@ -734,6 +768,13 @@ nvptx_output_return (void)
 {
   machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;
 
+  if (TARGET_SOFT_STACK
+      && !crtl->is_leaf
+      && (get_frame_size () > 0 || cfun->machine->has_call_with_sc))
+    {
+      int bits = BITS_PER_WORD;
+      fprintf (asm_out_file, "\tst.shared.u%d [%%fstmp2], %%fstmp1;\n", bits);
+    }
   if (mode != VOIDmode)
     {
       mode = arg_promotion (mode);
@@ -3278,6 +3319,11 @@ nvptx_file_end (void)
 	       worker_red_align,
 	       worker_red_name, worker_red_size);
     }
+
+  if (need_softstack_decl)
+    {
+      fprintf (asm_out_file, ".extern .shared .u64 __nvptx_stacks[];\n;");
+    }
 }
 
 /* Expander for the shuffle builtins.  */
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index 8017046..7ab09b9 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -28,3 +28,7 @@ Generate code for a 64-bit ABI.
 mmainkernel
 Target Report RejectNegative
 Link in code for a __main kernel.
+
+msoft-stack
+Target Report Mask(SOFT_STACK)
+Use custom stacks instead of local memory for automatic storage.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 587e30e..6e45fb6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -18935,6 +18935,13 @@ Generate code for 32-bit or 64-bit ABI.
 Link in code for a __main kernel.  This is for stand-alone instead of
 offloading execution.
 
+@item -msoft-stack
+@opindex msoft-stack
+Do not use @code{.local} memory for automatic storage.  Instead, use pointer
+in shared memory array @code{char *__nvptx_stacks[]} at position @code{tid.y}
+as the stack pointer.  This is for placing automatic variables into storage
+that can be accessed from other threads, or modified with atomic instructions.
+
 @end table
 
 @node PDP-11 Options
diff --git a/libgcc/config/nvptx/crt0.s b/libgcc/config/nvptx/crt0.s
index 38327ed..7a42e87 100644
--- a/libgcc/config/nvptx/crt0.s
+++ b/libgcc/config/nvptx/crt0.s
@@ -22,6 +22,9 @@
         exit;
 }
 
+.visible .shared .u64 __nvptx_stacks[1];
+.global .u64 %__softstack[16384];
+
 .extern .func (.param.u32 retval) main (.param.u32 argc, .param.u64 argv);
 
 .visible .entry __main (.param .u64 __retval, .param.u32 __argc, .param.u64 __argv)
@@ -34,6 +37,12 @@
         ld.param.u64    %rd0, [__retval];
         st.global.u64   [%__exitval], %rd0;
 
+        .reg .u64 %stackptr;
+        mov.u64	%stackptr, %__softstack;
+        cvta.global.u64	%stackptr, %stackptr;
+        add.u64	%stackptr, %stackptr, 131072;
+        st.shared.u64	[__nvptx_stacks], %stackptr;
+
 	ld.param.u32	%r1, [__argc];
 	ld.param.u64	%rd1, [__argv];
 	st.param.u32	[%argc], %r1;
-- 
1.8.3.1


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]