commit 78d6b16bf258106282f791f2e7b3010bf75f2a86 Author: Julian Brown Date: Wed Oct 15 02:10:00 2014 -0700 Async fixes/improvements. diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index a47617a..f44ca5e 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -294,6 +294,16 @@ openacc_parallel (void (*fn) (void *), size_t mapnum __attribute__((unused)), } STATIC void +openacc_register_async_cleanup (void *targ_mem_desc) +{ +#ifdef HOST_NONSHM_PLUGIN + /* "Asynchronous" launches are executed synchronously on the (non-SHM) host, + so there's no point in delaying host-side cleanup -- just do it now. */ + GOMP_PLUGIN_async_unmap_vars (targ_mem_desc); +#endif +} + +STATIC void openacc_async_set_async (int async __attribute__((unused))) { #ifdef DEBUG @@ -397,6 +407,8 @@ static struct gomp_device_descr host_dispatch = .exec_func = openacc_parallel, + .register_async_cleanup_func = openacc_register_async_cleanup, + .async_set_async_func = openacc_async_set_async, .async_test_func = openacc_async_test, .async_test_all_func = openacc_async_test_all, diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h index e1d2e32..03529cc 100644 --- a/libgomp/oacc-int.h +++ b/libgomp/oacc-int.h @@ -64,6 +64,9 @@ typedef struct ACC_dispatch_t void (*exec_func) (void (*) (void *), size_t, void **, void **, size_t *, unsigned short *, int, int, int, int, void *); + /* async cleanup callback registration */ + void (*register_async_cleanup_func) (void *); + /* asynchronous routines */ int (*async_test_func) (int); int (*async_test_all_func) (void); diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index 57ac8de..e3f156c 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -213,7 +213,10 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target, if (async < acc_async_noval) gomp_unmap_vars (tgt, true); else - gomp_copy_from_async (tgt); + { + gomp_copy_from_async (tgt); + ACC_dev->openacc.register_async_cleanup_func (tgt); + } ACC_dev->openacc.async_set_async_func (acc_async_sync); } diff --git a/libgomp/plugin-nvptx.c b/libgomp/plugin-nvptx.c index e163f3a..f193229 100644 --- a/libgomp/plugin-nvptx.c +++ b/libgomp/plugin-nvptx.c @@ -317,7 +317,8 @@ enum PTX_event_type { PTX_EVT_MEM, PTX_EVT_KNL, - PTX_EVT_SYNC + PTX_EVT_SYNC, + PTX_EVT_ASYNC_CLEANUP }; struct PTX_event @@ -325,7 +326,6 @@ struct PTX_event CUevent *evt; int type; void *addr; - void *tgt; int ord; SLIST_ENTRY(PTX_event) next; }; @@ -946,6 +946,10 @@ event_gc (bool memmap_lockable) break; case PTX_EVT_KNL: + map_pop (ptx_event->addr); + break; + + case PTX_EVT_ASYNC_CLEANUP: { /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the memory-map splay tree lock for the current device, so we @@ -955,9 +959,7 @@ event_gc (bool memmap_lockable) if (!memmap_lockable) goto next_event; - map_pop (ptx_event->addr); - if (ptx_event->tgt) - GOMP_PLUGIN_async_unmap_vars (ptx_event->tgt); + GOMP_PLUGIN_async_unmap_vars (ptx_event->addr); } break; } @@ -978,17 +980,17 @@ event_gc (bool memmap_lockable) } static void -event_add (enum PTX_event_type type, CUevent *e, void *h, void *tgt) +event_add (enum PTX_event_type type, CUevent *e, void *h) { struct PTX_event *ptx_event; - assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC); + assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC + || type == PTX_EVT_ASYNC_CLEANUP); ptx_event = GOMP_PLUGIN_malloc (sizeof (struct PTX_event)); ptx_event->type = type; ptx_event->evt = e; ptx_event->addr = h; - ptx_event->tgt = tgt; ptx_event->ord = PTX_dev->ord; GOMP_PLUGIN_mutex_lock (&PTX_event_lock); @@ -1092,7 +1094,7 @@ PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - event_add (PTX_EVT_KNL, e, (void *)dev_str, targ_mem_desc); + event_add (PTX_EVT_KNL, e, (void *)dev_str); } #else r = cuCtxSynchronize (); @@ -1194,7 +1196,7 @@ PTX_host2dev (void *d, const void *h, size_t s) if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - event_add (PTX_EVT_MEM, e, (void *)h, NULL); + event_add (PTX_EVT_MEM, e, (void *)h); } else #endif @@ -1257,7 +1259,7 @@ PTX_dev2host (void *h, const void *d, size_t s) if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - event_add (PTX_EVT_MEM, e, (void *)h, NULL); + event_add (PTX_EVT_MEM, e, (void *)h); } else #endif @@ -1289,7 +1291,15 @@ PTX_async_test (int async) r = cuStreamQuery (s->stream); if (r == CUDA_SUCCESS) - return 1; + { + /* The oacc-parallel.c:goacc_wait function calls this hook to determine + whether all work has completed on this stream, and if so omits the call + to the wait hook. If that happens, event_gc might not get called + (which prevents variables from getting unmapped and their associated + device storage freed), so call it here. */ + event_gc (true); + return 1; + } else if (r == CUDA_ERROR_NOT_READY) return 0; @@ -1318,6 +1328,8 @@ PTX_async_test_all (void) GOMP_PLUGIN_mutex_unlock (&PTX_dev->stream_lock); + event_gc (true); + return 1; } @@ -1370,7 +1382,7 @@ PTX_wait_async (int async1, int async2) if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - event_add (PTX_EVT_SYNC, e, NULL, NULL); + event_add (PTX_EVT_SYNC, e, NULL); r = cuStreamWaitEvent (s2->stream, *e, 0); if (r != CUDA_SUCCESS) @@ -1448,7 +1460,7 @@ PTX_wait_all_async (int async) if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); - event_add (PTX_EVT_SYNC, e, NULL, NULL); + event_add (PTX_EVT_SYNC, e, NULL); r = cuStreamWaitEvent (waiting_stream->stream, *e, 0); if (r != CUDA_SUCCESS) @@ -1771,6 +1783,30 @@ openacc_avail (void) return PTX_avail (); } +void +openacc_register_async_cleanup (void *targ_mem_desc) +{ + CUevent *e; + CUresult r; + +#ifdef DEBUG + fprintf (stderr, "libgomp plugin: %s:%s (%p)\n", __FILE__, __FUNCTION__, + targ_mem_desc); +#endif + + e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); + + r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuErrorMsg (r)); + + r = cuEventRecord (*e, current_stream->stream); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuErrorMsg (r)); + + event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc); +} + int openacc_async_test (int async) { diff --git a/libgomp/target.c b/libgomp/target.c index 418ba61..79b252d 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -1067,6 +1067,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM_OPT (openacc.get_device_num, openacc_get_device_num); DLSYM_OPT (openacc.set_device_num, openacc_set_device_num); DLSYM_OPT (openacc.avail, openacc_avail); + DLSYM_OPT (openacc.register_async_cleanup, + openacc_register_async_cleanup); DLSYM_OPT (openacc.async_test, openacc_async_test); DLSYM_OPT (openacc.async_test_all, openacc_async_test_all); DLSYM_OPT (openacc.async_wait, openacc_async_wait);