[gomp-nvptx 5/5] libgomp plugin: manage soft-stack storage
- From: Alexander Monakov <amonakov at ispras dot ru>
- To: gcc-patches at gcc dot gnu dot org
- Date: Mon, 15 Feb 2016 21:44:12 +0300
- Subject: [gomp-nvptx 5/5] libgomp plugin: manage soft-stack storage
- References: <1455561852-9237-1-git-send-email-amonakov at ispras dot ru>
This patch implements the libgomp plugin part of the transition to
host-allocated soft stacks. For now, only a simple scheme that allocates
and deallocates the storage on each kernel launch is implemented; a
follow-up change is planned to cache and reuse allocations when
appropriate.
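For reference, the per-launch scheme boils down to the following
host-side sequence (a minimal sketch, not part of the patch; error
checking is elided, and `function', `teams' and `threads' stand in for
the values computed in GOMP_OFFLOAD_run):

    /* One soft stack per warp slot: teams * threads warps in total,
       each stack_size bytes, allocated in device memory.  */
    size_t stack_size = 128 * 1024;
    CUdeviceptr stacks;
    cuMemAlloc (&stacks, stack_size * teams * threads);
    /* ... launch the kernel, passing STACKS and STACK_SIZE ...  */
    cuCtxSynchronize ();
    cuMemFree (stacks);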
The call to cuLaunchKernel is changed to pass the kernel entry
function's arguments in a way that allows the driver to check for a
mismatch (though it can only detect one when the cumulative size of the
passed arguments differs from what the kernel expects).
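To illustrate the difference (a sketch, not part of the patch; `stream'
stands in for ptx_dev->null_stream->stream): with the kernelParams
interface the driver receives an array of pointers to the individual
arguments and has no size information to validate, whereas the `extra'
interface passes one packed buffer together with its total size:

    /* Old style: kernelParams, one pointer per argument, no sizes.  */
    void *params[] = {&tgt_vars, &stacks, &stack_size};
    cuLaunchKernel (function, teams, 1, 1, 32, threads, 1,
                    0, stream, params, NULL);

    /* New style: one packed buffer with an explicit total size that
       the driver can compare against the kernel's parameter layout.  */
    void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
    size_t fn_args_size = sizeof fn_args;
    void *config[] = {
      CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
      CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
      CU_LAUNCH_PARAM_END
    };
    cuLaunchKernel (function, teams, 1, 1, 32, threads, 1,
                    0, stream, NULL, config);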
* plugin/plugin-nvptx.c (nvptx_stacks_size): New.
(nvptx_stacks_alloc): New.
(nvptx_stacks_free): New.
(GOMP_OFFLOAD_run): Allocate soft-stacks storage from the host using
the above new functions. Use kernel launch interface that allows
checking for mismatched total size of entry function arguments.
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index cb6a3ac..adf57b1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1892,6 +1892,37 @@ nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
*teams_p = max_blocks;
}
+/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
+ target regions. */
+
+static size_t
+nvptx_stacks_size ()
+{
+ return 128 * 1024;
+}
+
+/* Return contiguous storage for NUM stacks, each SIZE bytes. */
+
+static void *
+nvptx_stacks_alloc (size_t size, int num)
+{
+ CUdeviceptr stacks;
+ CUresult r = cuMemAlloc (&stacks, size * num);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
+ return (void *) stacks;
+}
+
+/* Release storage previously allocated by nvptx_stacks_alloc. */
+
+static void
+nvptx_stacks_free (void *p, int num)
+{
+ CUresult r = cuMemFree ((CUdeviceptr) p);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+}
+
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
@@ -1899,7 +1930,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
CUresult r;
struct ptx_device *ptx_dev = ptx_devices[ord];
const char *maybe_abort_msg = "(perhaps abort was called)";
- void *fn_args = &tgt_vars;
int teams = 0, threads = 0;
if (!args)
@@ -1922,10 +1952,19 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
}
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
+ size_t stack_size = nvptx_stacks_size ();
+ void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
+ void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
+ size_t fn_args_size = sizeof fn_args;
+ void *config[] = {
+ CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
+ CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
+ CU_LAUNCH_PARAM_END
+ };
r = cuLaunchKernel (function,
teams, 1, 1,
32, threads, 1,
- 0, ptx_dev->null_stream->stream, &fn_args, 0);
+ 0, ptx_dev->null_stream->stream, NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
@@ -1935,6 +1974,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
maybe_abort_msg);
else if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+ nvptx_stacks_free (stacks, teams * threads);
}
void