This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH 2/3, libgomp] nvptx plugin parts
- From: Chung-Lin Tang <cltang at codesourcery dot com>
- To: gcc-patches <gcc-patches at gcc dot gnu dot org>
- Cc: Nathan Sidwell <nathan at codesourcery dot com>, Thomas Schwinge <thomas at codesourcery dot com>
- Date: Thu, 27 Aug 2015 21:45:00 +0800
- Subject: [PATCH 2/3, libgomp] nvptx plugin parts
- Authentication-results: sourceware.org; auth=none
These are the nvptx plugin-specific parts.
Chung-Lin
* plugin/plugin-nvptx.c (CUDA_CALL_ERET): New convenience macro.
(CUDA_CALL): Likewise.
(CUDA_CALL_ASSERT): Likewise.
(map_init): Change return type to bool, use CUDA_CALL* macros.
(map_fini): Likewise.
(init_streams_for_device): Change return type to bool, adjust
call to map_init.
(fini_streams_for_device): Change return type to bool, adjust
call to map_fini.
(select_stream_for_async): Release stream_lock before calls to
GOMP_PLUGIN_fatal, adjust call to map_init.
(nvptx_init): Use CUDA_CALL* macros.
(nvptx_attach_host_thread_to_device): Change return type to bool,
use CUDA_CALL* macros.
(nvptx_open_device): Use CUDA_CALL* macros.
(nvptx_close_device): Change return type to bool, use CUDA_CALL* macros.
(nvptx_get_num_devices): Use CUDA_CALL* macros.
(link_ptx): Change return type to bool, use CUDA_CALL* macros.
(nvptx_exec): Use CUDA_CALL* macros.
(nvptx_alloc): Change return type to bool, use CUDA_CALL* macros,
change to use out parameter to return allocated pointer.
(nvptx_free): Change return type to bool, use CUDA_CALL* macros.
(nvptx_host2dev): Likewise.
(nvptx_dev2host): Likewise.
(nvptx_wait): Use CUDA_CALL* macros.
(nvptx_wait_async): Likewise.
(nvptx_wait_all): Likewise.
(nvptx_wait_all_async): Likewise.
(nvptx_set_cuda_stream): Adjust order of stream_lock acquire,
use CUDA_CALL* macros, adjust call to map_fini.
(GOMP_OFFLOAD_init_device): Change return type to bool,
adjust code accordingly.
(GOMP_OFFLOAD_fini_device): Likewise.
(GOMP_OFFLOAD_load_image): Adjust calls to
nvptx_attach_host_thread_to_device/link_ptx to handle errors,
use CUDA_CALL* macros.
(GOMP_OFFLOAD_alloc): Change return type to bool, adjust code
to handle error return.
(GOMP_OFFLOAD_free): Likewise.
(GOMP_OFFLOAD_dev2host): Likewise.
(GOMP_OFFLOAD_host2dev): Likewise.
(GOMP_OFFLOAD_openacc_register_async_cleanup): Use CUDA_CALL* macros.
(GOMP_OFFLOAD_openacc_create_thread_data): Likewise.
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c (revision 227257)
+++ libgomp/plugin/plugin-nvptx.c (working copy)
@@ -127,6 +127,34 @@ cuda_error (CUresult r)
return errmsg;
}
+/* Convenience macros for the frequently used CUDA library call and
+ error handling sequence. This does not capture all the cases we
+ use in this file, but is common enough. */
+
+#define CUDA_CALL_ERET(ERET, FN, ...) \
+ do { \
+ unsigned __r = FN (__VA_ARGS__); \
+ if (__r != CUDA_SUCCESS) \
+ { \
+ GOMP_PLUGIN_error (#FN " error: %s", \
+ cuda_error (__r)); \
+ return ERET; \
+ } \
+ } while (0)
+
+#define CUDA_CALL(FN, ...) \
+ CUDA_CALL_ERET (false, (FN), __VA_ARGS__)
+
+#define CUDA_CALL_ASSERT(FN, ...) \
+ do { \
+ unsigned __r = FN (__VA_ARGS__); \
+ if (__r != CUDA_SUCCESS) \
+ { \
+ GOMP_PLUGIN_fatal (#FN " error: %s", \
+ cuda_error (__r)); \
+ } \
+ } while (0)
+
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -162,25 +190,18 @@ struct map
char mappings[0];
};
-static void
+static bool
map_init (struct ptx_stream *s)
{
- CUresult r;
-
int size = getpagesize ();
assert (s);
assert (!s->d);
assert (!s->h);
- r = cuMemAllocHost (&s->h, size);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
+ CUDA_CALL (cuMemAllocHost, &s->h, size);
+ CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
- r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
-
assert (s->h);
s->h_begin = s->h;
@@ -189,16 +210,14 @@ map_init (struct ptx_stream *s)
assert (s->h_next);
assert (s->h_end);
+ return true;
}
-static void
+static bool
map_fini (struct ptx_stream *s)
{
- CUresult r;
-
- r = cuMemFreeHost (s->h);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
+ CUDA_CALL (cuMemFreeHost, s->h);
+ return true;
}
static void
@@ -359,7 +378,7 @@ nvptx_thread (void)
return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
-static void
+static bool
init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
{
int i;
@@ -371,9 +390,10 @@ init_streams_for_device (struct ptx_device *ptx_de
null_stream->multithreaded = true;
null_stream->d = (CUdeviceptr) NULL;
null_stream->h = NULL;
- map_init (null_stream);
- ptx_dev->null_stream = null_stream;
+ if (!map_init (null_stream))
+ return false;
+ ptx_dev->null_stream = null_stream;
ptx_dev->active_streams = NULL;
pthread_mutex_init (&ptx_dev->stream_lock, NULL);
@@ -389,25 +409,35 @@ init_streams_for_device (struct ptx_device *ptx_de
for (i = 0; i < concurrency; i++)
ptx_dev->async_streams.arr[i] = NULL;
+
+ return true;
}
-static void
+static bool
fini_streams_for_device (struct ptx_device *ptx_dev)
{
free (ptx_dev->async_streams.arr);
+ bool ret = true;
while (ptx_dev->active_streams != NULL)
{
struct ptx_stream *s = ptx_dev->active_streams;
ptx_dev->active_streams = ptx_dev->active_streams->next;
- map_fini (s);
- cuStreamDestroy (s->stream);
+ ret &= map_fini (s);
+
+ CUresult r = cuStreamDestroy (s->stream);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
+ ret = false;
+ }
free (s);
}
- map_fini (ptx_dev->null_stream);
+ ret &= map_fini (ptx_dev->null_stream);
free (ptx_dev->null_stream);
+ return ret;
}
/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
@@ -481,7 +511,11 @@ select_stream_for_async (int async, pthread_t thre
{
r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
+ {
+ pthread_mutex_unlock (&ptx_dev->stream_lock);
+ GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
+ cuda_error (r));
+ }
}
/* If CREATE is true, we're going to be queueing some work on this
@@ -491,7 +525,11 @@ select_stream_for_async (int async, pthread_t thre
s->d = (CUdeviceptr) NULL;
s->h = NULL;
- map_init (s);
+ if (!map_init (s))
+ {
+ pthread_mutex_unlock (&ptx_dev->stream_lock);
+ GOMP_PLUGIN_fatal ("map_init fail");
+ }
s->next = ptx_dev->active_streams;
ptx_dev->active_streams = s;
@@ -501,7 +539,11 @@ select_stream_for_async (int async, pthread_t thre
stream = ptx_dev->async_streams.arr[async];
}
else if (async < 0)
- GOMP_PLUGIN_fatal ("bad async %d", async);
+ {
+ if (create)
+ pthread_mutex_unlock (&ptx_dev->stream_lock);
+ GOMP_PLUGIN_fatal ("bad async %d", async);
+ }
if (create)
{
@@ -532,34 +574,25 @@ select_stream_for_async (int async, pthread_t thre
static bool
nvptx_init (void)
{
- CUresult r;
int ndevs;
if (instantiated_devices != 0)
return true;
- r = cuInit (0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
-
+ CUDA_CALL (cuInit, 0);
ptx_events = NULL;
-
pthread_mutex_init (&ptx_event_lock, NULL);
- r = cuDeviceGetCount (&ndevs);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
-
+ CUDA_CALL (cuDeviceGetCount, &ndevs);
ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
* ndevs);
-
return true;
}
/* Select the N'th PTX device for the current host thread. The device must
have been previously opened before calling this function. */
-static void
+static bool
nvptx_attach_host_thread_to_device (int n)
{
CUdevice dev;
@@ -569,34 +602,34 @@ nvptx_attach_host_thread_to_device (int n)
r = cuCtxGetDevice (&dev);
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
- GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+ {
+ GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
+ return false;
+ }
if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
- return;
+ return true;
else
{
CUcontext old_ctx;
ptx_dev = ptx_devices[n];
- assert (ptx_dev);
+ if (!ptx_dev)
+ {
+ GOMP_PLUGIN_error ("device %d not found", n);
+ return false;
+ }
- r = cuCtxGetCurrent (&thd_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+ CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
/* We don't necessarily have a current context (e.g. if it has been
destroyed. Pop it if we do though. */
if (thd_ctx != NULL)
- {
- r = cuCtxPopCurrent (&old_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
- }
+ CUDA_CALL (cuCtxPopCurrent, &old_ctx);
- r = cuCtxPushCurrent (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
+ CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
}
+ return true;
}
static struct ptx_device *
@@ -607,9 +640,7 @@ nvptx_open_device (int n)
CUresult r;
int async_engines, pi;
- r = cuDeviceGet (&dev, n);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
+ CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
@@ -619,60 +650,44 @@ nvptx_open_device (int n)
r = cuCtxGetDevice (&ctx_dev);
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
- GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
+ {
+ GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
+ return NULL;
+ }
if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
{
/* The current host thread has an active context for a different device.
Detach it. */
CUcontext old_ctx;
-
- r = cuCtxPopCurrent (&old_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
+ CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
}
- r = cuCtxGetCurrent (&ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+ CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
if (!ptx_dev->ctx)
- {
- r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
- }
+ CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
else
ptx_dev->ctx_shared = true;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
ptx_dev->overlap = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
ptx_dev->map = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
ptx_dev->concur = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
ptx_dev->mode = pi;
- r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
-
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
ptx_dev->mkern = pi;
r = cuDeviceGetAttribute (&async_engines,
@@ -683,38 +698,34 @@ nvptx_open_device (int n)
ptx_dev->images = NULL;
pthread_mutex_init (&ptx_dev->image_lock, NULL);
- init_streams_for_device (ptx_dev, async_engines);
+ if (!init_streams_for_device (ptx_dev, async_engines))
+ return NULL;
return ptx_dev;
}
-static void
+static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
- CUresult r;
-
if (!ptx_dev)
- return;
+ return true;
- fini_streams_for_device (ptx_dev);
+ if (!fini_streams_for_device (ptx_dev))
+ return false;
pthread_mutex_destroy (&ptx_dev->image_lock);
if (!ptx_dev->ctx_shared)
- {
- r = cuCtxDestroy (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
- }
+ CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
free (ptx_dev);
+ return true;
}
static int
nvptx_get_num_devices (void)
{
int n;
- CUresult r;
/* PR libgomp/65099: Currently, we only support offloading in 64-bit
configurations. */
@@ -727,22 +738,19 @@ nvptx_get_num_devices (void)
further initialization). */
if (instantiated_devices == 0)
{
- r = cuInit (0);
+ CUresult r = cuInit (0);
/* This is not an error: e.g. we may have CUDA libraries installed but
no devices available. */
if (r != CUDA_SUCCESS)
return 0;
}
- r = cuDeviceGetCount (&n);
- if (r!= CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
-
+ CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
return n;
}
-static void
+static bool
link_ptx (CUmodule *module, const char *ptx_code)
{
CUjit_option opts[7];
@@ -780,9 +788,7 @@ link_ptx (CUmodule *module, const char *ptx_code)
opts[6] = CU_JIT_TARGET;
optvals[6] = (void *) CU_TARGET_COMPUTE_30;
- r = cuLinkCreate (7, opts, optvals, &linkstate);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
+ CUDA_CALL (cuLinkCreate, 7, opts, optvals, &linkstate);
char *abort_ptx = ABORT_PTX;
r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
@@ -790,7 +796,8 @@ link_ptx (CUmodule *module, const char *ptx_code)
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
+ GOMP_PLUGIN_error ("cuLinkAddData (abort) error: %s", cuda_error (r));
+ return false;
}
char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
@@ -799,8 +806,9 @@ link_ptx (CUmodule *module, const char *ptx_code)
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
+ GOMP_PLUGIN_error ("cuLinkAddData (acc_on_device) error: %s",
cuda_error (r));
+ return false;
}
char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
@@ -809,29 +817,28 @@ link_ptx (CUmodule *module, const char *ptx_code)
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+ GOMP_PLUGIN_error ("cuLinkAddData (goacc_internal_ptx) error: %s",
cuda_error (r));
+ return false;
}
/* cuLinkAddData's 'data' argument erroneously omits the const qualifier. */
r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
- strlen (ptx_code) + 1, 0, 0, 0, 0);
+ strlen (ptx_code) + 1, 0, 0, 0, 0);
if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
- GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+ GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
+ return false;
}
- r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
+ CUDA_CALL (cuLinkComplete, linkstate, &linkout, &linkoutsize);
GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
- r = cuModuleLoadData (module, linkout);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
+ CUDA_CALL (cuModuleLoadData, module, linkout);
+ return true;
}
static void
@@ -961,11 +968,11 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
/* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */
- r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+ mapnum * sizeof (void *));
- GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
+ GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__,
+ targ_fn->name);
// OpenACC CUDA
//
@@ -991,12 +998,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
nthreads_in_block = vector_length;
kargs[0] = &dp;
- r = cuLaunchKernel (function,
- num_gangs, 1, 1,
- nthreads_in_block, 1, 1,
- 0, dev_str->stream, kargs, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuLaunchKernel, function,
+ num_gangs, 1, 1,
+ nthreads_in_block, 1, 1,
+ 0, dev_str->stream, kargs, 0);
#ifndef DISABLE_ASYNC
if (async < acc_async_noval)
@@ -1023,9 +1028,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
event_gc (true);
- r = cuEventRecord (*e, dev_str->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
event_add (PTX_EVT_KNL, e, (void *)dev_str);
}
@@ -1049,167 +1052,157 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
void * openacc_get_current_cuda_context (void);
-static void *
-nvptx_alloc (size_t s)
+static bool
+nvptx_alloc (size_t s, void **ptr)
{
CUdeviceptr d;
CUresult r;
r = cuMemAlloc (&d, s);
if (r == CUDA_ERROR_OUT_OF_MEMORY)
- return 0;
+ {
+ *ptr = NULL;
+ return true;
+ }
+
if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
- return (void *)d;
+ {
+ GOMP_PLUGIN_error ("cuMemAlloc error: %s", cuda_error (r));
+ return false;
+ }
+
+ *ptr = (void *)d;
+ return true;
}
-static void
+static bool
nvptx_free (void *p)
{
- CUresult r;
CUdeviceptr pb;
size_t ps;
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+ CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
+ if ((CUdeviceptr) p != pb)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
- if ((CUdeviceptr)p != pb)
- GOMP_PLUGIN_fatal ("invalid device address");
-
- r = cuMemFree ((CUdeviceptr)p);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
+ CUDA_CALL (cuMemFree, (CUdeviceptr) p);
+ return true;
}
-static void *
+
+static bool
nvptx_host2dev (void *d, const void *h, size_t s)
{
- CUresult r;
CUdeviceptr pb;
size_t ps;
struct nvptx_thread *nvthd = nvptx_thread ();
if (!s)
- return 0;
-
+ return true;
if (!d)
- GOMP_PLUGIN_fatal ("invalid device address");
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+ CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
if (!pb)
- GOMP_PLUGIN_fatal ("invalid device address");
-
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
if (!h)
- GOMP_PLUGIN_fatal ("invalid host address");
-
+ {
+ GOMP_PLUGIN_error ("invalid host address");
+ return false;
+ }
if (d == h)
- GOMP_PLUGIN_fatal ("invalid host or device address");
-
+ {
+ GOMP_PLUGIN_error ("invalid host or device address");
+ return false;
+ }
if ((void *)(d + s) > (void *)(pb + ps))
- GOMP_PLUGIN_fatal ("invalid size");
+ {
+ GOMP_PLUGIN_error ("invalid size");
+ return false;
+ }
#ifndef DISABLE_ASYNC
if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
- CUevent *e;
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
+ CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+ CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
event_gc (false);
-
- r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
- nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
-
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+ CUDA_CALL (cuMemcpyHtoDAsync,
+ (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
+ CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
event_add (PTX_EVT_MEM, e, (void *)h);
}
else
#endif
- {
- r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
- }
+ CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
- return 0;
+ return true;
}
-static void *
+static bool
nvptx_dev2host (void *h, const void *d, size_t s)
{
- CUresult r;
CUdeviceptr pb;
size_t ps;
struct nvptx_thread *nvthd = nvptx_thread ();
if (!s)
- return 0;
-
+ return true;
if (!d)
- GOMP_PLUGIN_fatal ("invalid device address");
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
- r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
+ CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
if (!pb)
- GOMP_PLUGIN_fatal ("invalid device address");
-
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
if (!h)
- GOMP_PLUGIN_fatal ("invalid host address");
-
+ {
+ GOMP_PLUGIN_error ("invalid host address");
+ return false;
+ }
if (d == h)
- GOMP_PLUGIN_fatal ("invalid host or device address");
-
+ {
+ GOMP_PLUGIN_error ("invalid host or device address");
+ return false;
+ }
if ((void *)(d + s) > (void *)(pb + ps))
- GOMP_PLUGIN_fatal ("invalid size");
+ {
+ GOMP_PLUGIN_error ("invalid size");
+ return false;
+ }
#ifndef DISABLE_ASYNC
if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
- CUevent *e;
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
-
+ CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+ CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
event_gc (false);
-
- r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
- nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
-
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+ CUDA_CALL (cuMemcpyDtoHAsync,
+ h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
+ CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
event_add (PTX_EVT_MEM, e, (void *)h);
}
else
#endif
- {
- r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
- }
+ CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
- return 0;
+ return true;
}
static void
@@ -1279,17 +1272,13 @@ nvptx_async_test_all (void)
static void
nvptx_wait (int async)
{
- CUresult r;
struct ptx_stream *s;
s = select_stream_for_async (async, pthread_self (), false, NULL);
-
if (!s)
GOMP_PLUGIN_fatal ("unknown async %d", async);
- r = cuStreamSynchronize (s->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
event_gc (true);
}
@@ -1297,7 +1286,6 @@ nvptx_wait (int async)
static void
nvptx_wait_async (int async1, int async2)
{
- CUresult r;
CUevent *e;
struct ptx_stream *s1, *s2;
pthread_t self = pthread_self ();
@@ -1313,23 +1301,17 @@ nvptx_wait_async (int async1, int async2)
if (s1 == s2)
GOMP_PLUGIN_fatal ("identical parameters");
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
+ e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
event_gc (true);
- r = cuEventRecord (*e, s1->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
event_add (PTX_EVT_SYNC, e, NULL);
- r = cuStreamWaitEvent (s2->stream, *e, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
}
static void
@@ -1354,9 +1336,7 @@ nvptx_wait_all (void)
else if (r != CUDA_ERROR_NOT_READY)
GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
- r = cuStreamSynchronize (s->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
}
}
@@ -1368,7 +1348,6 @@ nvptx_wait_all (void)
static void
nvptx_wait_all_async (int async)
{
- CUresult r;
struct ptx_stream *waiting_stream, *other_stream;
CUevent *e;
struct nvptx_thread *nvthd = nvptx_thread ();
@@ -1398,20 +1377,14 @@ nvptx_wait_all_async (int async)
e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
/* Record an event on the waited-for stream. */
- r = cuEventRecord (*e, other_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
event_add (PTX_EVT_SYNC, e, NULL);
- r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
}
pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
@@ -1460,11 +1433,11 @@ nvptx_set_cuda_stream (int async, void *stream)
pthread_t self = pthread_self ();
struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
if (async < 0)
GOMP_PLUGIN_fatal ("bad async %d", async);
+ pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
+
/* We have a list of active streams and an array mapping async values to
entries of that list. We need to take "ownership" of the passed-in stream,
and add it to our list, removing the previous entry also (if there was one)
@@ -1487,8 +1460,11 @@ nvptx_set_cuda_stream (int async, void *stream)
s->next = s->next->next;
}
- cuStreamDestroy (oldstream->stream);
- map_fini (oldstream);
+ CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
+
+ if (!map_fini (oldstream))
+ GOMP_PLUGIN_fatal ("error when freeing host memory");
+
free (oldstream);
}
@@ -1525,37 +1501,50 @@ GOMP_OFFLOAD_get_num_devices (void)
return nvptx_get_num_devices ();
}
-void
+bool
GOMP_OFFLOAD_init_device (int n)
{
+ struct ptx_device *dev;
+
pthread_mutex_lock (&ptx_dev_lock);
if (!nvptx_init () || ptx_devices[n] != NULL)
{
pthread_mutex_unlock (&ptx_dev_lock);
- return;
+ return false;
}
- ptx_devices[n] = nvptx_open_device (n);
- instantiated_devices++;
+ dev = nvptx_open_device (n);
+ if (dev)
+ {
+ ptx_devices[n] = dev;
+ instantiated_devices++;
+ }
pthread_mutex_unlock (&ptx_dev_lock);
+
+ return dev != NULL;
}
-void
+bool
GOMP_OFFLOAD_fini_device (int n)
{
pthread_mutex_lock (&ptx_dev_lock);
if (ptx_devices[n] != NULL)
{
- nvptx_attach_host_thread_to_device (n);
- nvptx_close_device (ptx_devices[n]);
+ if (!nvptx_attach_host_thread_to_device (n)
+ || !nvptx_close_device (ptx_devices[n]))
+ {
+ pthread_mutex_unlock (&ptx_dev_lock);
+ return false;
+ }
ptx_devices[n] = NULL;
instantiated_devices--;
}
pthread_mutex_unlock (&ptx_dev_lock);
+ return true;
}
/* Data emitted by mkoffload. */
@@ -1590,7 +1579,6 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
CUmodule module;
const char *const *fn_names, *const *var_names;
unsigned int fn_entries, var_entries, i, j;
- CUresult r;
struct targ_fn_descriptor *targ_fns;
struct addr_pair *targ_tbl;
const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
@@ -1598,18 +1586,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
struct ptx_device *dev;
if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
- GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
- " (expected %u, received %u)",
- GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
-
- GOMP_OFFLOAD_init_device (ord);
+ {
+ GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
+ " (expected %u, received %u)",
+ GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
+ return -1;
+ }
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !link_ptx (&module, img_header->ptx_src))
+ return -1;
+
dev = ptx_devices[ord];
-
- nvptx_attach_host_thread_to_device (ord);
- link_ptx (&module, img_header->ptx_src);
-
/* The mkoffload utility emits a struct of pointers/integers at the
start of each offload image. The array of kernel names and the
functions addresses form a one-to-one correspondence. */
@@ -1639,11 +1628,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
{
CUfunction function;
+ CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module, fn_names[i]);
- r = cuModuleGetFunction (&function, module, fn_names[i]);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
-
targ_fns->fn = function;
targ_fns->name = (const char *) fn_names[i];
@@ -1656,9 +1642,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version
CUdeviceptr var;
size_t bytes;
- r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
+ CUDA_CALL_ERET (-1, cuModuleGetGlobal,
+ &var, &bytes, module, var_names[j]);
targ_tbl->start = (uintptr_t) var;
targ_tbl->end = targ_tbl->start + bytes;
@@ -1692,32 +1677,32 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned versi
pthread_mutex_unlock (&dev->image_lock);
}
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+bool
+GOMP_OFFLOAD_alloc (int ord, size_t size, void **ptr)
{
- nvptx_attach_host_thread_to_device (ord);
- return nvptx_alloc (size);
+ return (nvptx_attach_host_thread_to_device (ord)
+ && nvptx_alloc (size, ptr));
}
-void
+bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
- nvptx_attach_host_thread_to_device (ord);
- nvptx_free (ptr);
+ return (nvptx_attach_host_thread_to_device (ord)
+ && nvptx_free (ptr));
}
-void *
+bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
- nvptx_attach_host_thread_to_device (ord);
- return nvptx_dev2host (dst, src, n);
+ return (nvptx_attach_host_thread_to_device (ord)
+ && nvptx_dev2host (dst, src, n));
}
-void *
+bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
- nvptx_attach_host_thread_to_device (ord);
- return nvptx_host2dev (dst, src, n);
+ return (nvptx_attach_host_thread_to_device (ord)
+ && nvptx_host2dev (dst, src, n));
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
@@ -1736,20 +1721,11 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *)
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
- CUevent *e;
- CUresult r;
struct nvptx_thread *nvthd = nvptx_thread ();
+ CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
- r = cuEventRecord (*e, nvthd->current_stream->stream);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
-
+ CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
+ CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
@@ -1801,25 +1777,18 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord)
struct ptx_device *ptx_dev;
struct nvptx_thread *nvthd
= GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
- CUresult r;
CUcontext thd_ctx;
ptx_dev = ptx_devices[ord];
assert (ptx_dev);
- r = cuCtxGetCurrent (&thd_ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
assert (ptx_dev->ctx);
if (!thd_ctx)
- {
- r = cuCtxPushCurrent (ptx_dev->ctx);
- if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
- }
+ CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
nvthd->current_stream = ptx_dev->null_stream;
nvthd->ptx_dev = ptx_dev;