[PATCH, OpenACC, libgomp, v6, stage1] Async-rework update
Chung-Lin Tang
chunglin_tang@mentor.com
Mon Feb 25 18:15:00 GMT 2019
Hi Thomas,
I have incorporated all the patches you included in your last mail (with
some modifications, though fairly minor, I think).
The default_async, GOMP_PLUGIN_IF_VERSION, and testsuite changes have all been removed.
We can work on them later as we clarify more things.
Thanks,
Chung-Lin
-------------- next part --------------
Index: libgomp/oacc-async.c
===================================================================
--- libgomp/oacc-async.c (revision 269183)
+++ libgomp/oacc-async.c (working copy)
@@ -27,49 +27,162 @@
<http://www.gnu.org/licenses/>. */
#include <assert.h>
+#include <string.h>
#include "openacc.h"
#include "libgomp.h"
#include "oacc-int.h"
-int
-acc_async_test (int async)
+static struct goacc_thread *
+get_goacc_thread (void)
{
- if (!async_valid_p (async))
- gomp_fatal ("invalid async argument: %d", async);
-
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- return thr->dev->openacc.async_test_func (async);
+ return thr;
}
-int
-acc_async_test_all (void)
+static struct gomp_device_descr *
+get_goacc_thread_device (void)
{
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- return thr->dev->openacc.async_test_all_func ();
+ return thr->dev;
}
-void
-acc_wait (int async)
+static int
+validate_async_val (int async)
{
if (!async_valid_p (async))
- gomp_fatal ("invalid async argument: %d", async);
+ gomp_fatal ("invalid async-argument: %d", async);
+ if (async == acc_async_sync)
+ return -1;
+
+ if (async == acc_async_noval)
+ return 0;
+
+ if (async >= 0)
+ /* TODO: we reserve 0 for acc_async_noval before we can clarify the
+ semantics of "default_async". */
+ return 1 + async;
+ else
+ __builtin_unreachable ();
+}
+
+/* Return the asyncqueue of THR's device for OpenACC async-argument ASYNC, or
+ NULL if ASYNC selects no queue (acc_async_sync) or if !CREATE and none exists
+ yet. If CREATE, construct and register a missing queue; fatal on failure. */
+
+attribute_hidden struct goacc_asyncqueue *
+lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
+{
+ async = validate_async_val (async); /* Queue-array index, or negative for "no queue". */
+ if (async < 0)
+ return NULL;
+
+ struct goacc_asyncqueue *ret_aq = NULL;
+ struct gomp_device_descr *dev = thr->dev;
+
+ gomp_mutex_lock (&dev->openacc.async.lock);
+
+ if (!create
+ && (async >= dev->openacc.async.nasyncqueue
+ || !dev->openacc.async.asyncqueue[async]))
+ goto end; /* Lookup only and no queue at this index: return NULL. */
+
+ if (async >= dev->openacc.async.nasyncqueue)
+ { /* Grow the array so index ASYNC is valid and zero the new slots. */
+ int diff = async + 1 - dev->openacc.async.nasyncqueue;
+ dev->openacc.async.asyncqueue
+ = gomp_realloc (dev->openacc.async.asyncqueue,
+ sizeof (goacc_aq) * (async + 1));
+ memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
+ 0, sizeof (goacc_aq) * diff);
+ dev->openacc.async.nasyncqueue = async + 1;
+ }
+
+ if (!dev->openacc.async.asyncqueue[async])
+ {
+ dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
+
+ if (!dev->openacc.async.asyncqueue[async])
+ {
+ gomp_mutex_unlock (&dev->openacc.async.lock); /* Release the lock before reporting the error. */
+ gomp_fatal ("async %d creation failed", async);
+ }
+
+ /* Link new async queue into active list. */
+ goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
+ n->aq = dev->openacc.async.asyncqueue[async];
+ n->next = dev->openacc.async.active;
+ dev->openacc.async.active = n;
+ }
+
+ ret_aq = dev->openacc.async.asyncqueue[async];
+
+ end:
+ gomp_mutex_unlock (&dev->openacc.async.lock);
+ return ret_aq;
+}
+
+/* Return the asyncqueue of the current thread's device for OpenACC
+ async-argument ASYNC, creating it if needed; NULL if no asyncqueue is to be
+ used. Fatal ("no device active") if the thread has no active device. */
+
+attribute_hidden struct goacc_asyncqueue *
+get_goacc_asyncqueue (int async)
+{
+ struct goacc_thread *thr = get_goacc_thread ();
+ return lookup_goacc_asyncqueue (thr, true, async); /* CREATE is true here. */
+}
+
+int
+acc_async_test (int async)
+{
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
gomp_fatal ("no device active");
- thr->dev->openacc.async_wait_func (async);
+ goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+ if (!aq)
+ return 1;
+ else
+ return thr->dev->openacc.async.test_func (aq);
}
+int
+acc_async_test_all (void)
+{
+ struct goacc_thread *thr = get_goacc_thread (); /* Fatal if no device is active. */
+ /* Result is 1 unless test_func returns zero for some queue on the active list. */
+ int ret = 1;
+ gomp_mutex_lock (&thr->dev->openacc.async.lock); /* The active list is walked with the async lock held. */
+ for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+ if (!thr->dev->openacc.async.test_func (l->aq))
+ {
+ ret = 0;
+ break; /* One zero result decides the answer; skip the remaining queues. */
+ }
+ gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+ return ret;
+}
+
+void
+acc_wait (int async)
+{
+ struct goacc_thread *thr = get_goacc_thread (); /* Fatal if no device is active. */
+ /* Look up without creating: if ASYNC has no queue, there is nothing to wait for. */
+ goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+ if (aq && !thr->dev->openacc.async.synchronize_func (aq))
+ gomp_fatal ("wait on %d failed", async);
+}
+
/* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */
#ifdef HAVE_ATTRIBUTE_ALIAS
strong_alias (acc_wait, acc_async_wait)
@@ -84,23 +197,46 @@ acc_async_wait (int async)
void
acc_wait_async (int async1, int async2)
{
- struct goacc_thread *thr = goacc_thread ();
+ struct goacc_thread *thr = get_goacc_thread ();
- if (!thr || !thr->dev)
- gomp_fatal ("no device active");
+ goacc_aq aq1 = lookup_goacc_asyncqueue (thr, false, async1);
+ /* TODO: Is this also correct for acc_async_sync, assuming that in this case,
+ we'll always be synchronous anyways? */
+ if (!aq1)
+ return;
- thr->dev->openacc.async_wait_async_func (async1, async2);
+ goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
+ /* An async queue is always synchronized with itself. */
+ if (aq1 == aq2)
+ return;
+
+ if (aq2)
+ {
+ if (!thr->dev->openacc.async.serialize_func (aq1, aq2))
+ gomp_fatal ("ordering of async ids %d and %d failed", async1, async2);
+ }
+ else
+ {
+ /* TODO: Local thread synchronization.
+ Necessary for the "async2 == acc_async_sync" case, or can just skip? */
+ if (!thr->dev->openacc.async.synchronize_func (aq1))
+ gomp_fatal ("wait on %d failed", async1);
+ }
}
void
acc_wait_all (void)
{
- struct goacc_thread *thr = goacc_thread ();
+ struct gomp_device_descr *dev = get_goacc_thread_device ();
- if (!thr || !thr->dev)
- gomp_fatal ("no device active");
+ bool ret = true;
+ gomp_mutex_lock (&dev->openacc.async.lock);
+ for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
+ ret &= dev->openacc.async.synchronize_func (l->aq);
+ gomp_mutex_unlock (&dev->openacc.async.lock);
- thr->dev->openacc.async_wait_all_func ();
+ if (!ret)
+ gomp_fatal ("wait all failed");
}
/* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. */
@@ -117,13 +253,73 @@ acc_async_wait_all (void)
void
acc_wait_all_async (int async)
{
- if (!async_valid_p (async))
- gomp_fatal ("invalid async argument: %d", async);
+ struct goacc_thread *thr = get_goacc_thread ();
- struct goacc_thread *thr = goacc_thread ();
+ goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
- if (!thr || !thr->dev)
- gomp_fatal ("no device active");
+ bool ret = true;
+ gomp_mutex_lock (&thr->dev->openacc.async.lock);
+ for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
+ {
+ if (waiting_queue)
+ ret &= thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
+ else
+ /* TODO: Local thread synchronization.
+ Necessary for the "async == acc_async_sync" case, or can just skip? */
+ ret &= thr->dev->openacc.async.synchronize_func (l->aq);
+ }
+ gomp_mutex_unlock (&thr->dev->openacc.async.lock);
- thr->dev->openacc.async_wait_all_async_func (async);
+ if (!ret)
+ gomp_fatal ("wait all async(%d) failed", async);
}
+
+attribute_hidden void
+goacc_async_free (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq, void *ptr)
+{
+ if (!aq)
+ free (ptr); /* No asyncqueue: free PTR immediately. */
+ else
+ devicep->openacc.async.queue_callback_func (aq, free, ptr); /* Hand free/PTR to the device hook for AQ; presumably run after AQ's queued work -- confirm in plugin. */
+}
+
+/* Initialize the asyncqueue state of DEVICEP: empty queue array, empty active
+ list, and a freshly initialized async lock. TODO DEVICEP must be locked on
+ entry, and remains locked on return. */
+
+attribute_hidden void
+goacc_init_asyncqueues (struct gomp_device_descr *devicep)
+{
+ devicep->openacc.async.nasyncqueue = 0;
+ devicep->openacc.async.asyncqueue = NULL; /* Array is allocated on demand by lookup_goacc_asyncqueue. */
+ devicep->openacc.async.active = NULL;
+ gomp_mutex_init (&devicep->openacc.async.lock);
+}
+
+/* Finalize DEVICEP's asyncqueues and async lock; false if a destruct_func fails.
+ TODO DEVICEP must be locked on entry, and remains locked on return. */
+
+attribute_hidden bool
+goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
+{
+ bool ret = true;
+ gomp_mutex_lock (&devicep->openacc.async.lock);
+ if (devicep->openacc.async.nasyncqueue > 0)
+ {
+ goacc_aq_list next;
+ for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
+ {
+ ret &= devicep->openacc.async.destruct_func (l->aq); /* Keep going after a failure; only the result is recorded. */
+ next = l->next; /* Save the link before the list node is freed. */
+ free (l);
+ }
+ free (devicep->openacc.async.asyncqueue);
+ devicep->openacc.async.nasyncqueue = 0; /* Reset to the same empty state goacc_init_asyncqueues sets up. */
+ devicep->openacc.async.asyncqueue = NULL;
+ devicep->openacc.async.active = NULL;
+ }
+ gomp_mutex_unlock (&devicep->openacc.async.lock);
+ gomp_mutex_destroy (&devicep->openacc.async.lock); /* The lock is unusable until goacc_init_asyncqueues runs again. */
+ return ret;
+}
Index: libgomp/oacc-plugin.c
===================================================================
--- libgomp/oacc-plugin.c (revision 269183)
+++ libgomp/oacc-plugin.c (working copy)
@@ -30,15 +30,12 @@
#include "oacc-plugin.h"
#include "oacc-int.h"
+/* This plugin function is now obsolete. */
void
-GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
+GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
+ int async __attribute__((unused)))
{
- struct target_mem_desc *tgt = ptr;
- struct gomp_device_descr *devicep = tgt->device_descr;
-
- devicep->openacc.async_set_async_func (async);
- gomp_unmap_vars (tgt, true);
- devicep->openacc.async_set_async_func (acc_async_sync);
+ gomp_fatal ("invalid plugin function");
}
/* Return the target-specific part of the TLS data for the current thread. */
Index: libgomp/plugin/cuda/cuda.h
===================================================================
--- libgomp/plugin/cuda/cuda.h (revision 269183)
+++ libgomp/plugin/cuda/cuda.h (working copy)
@@ -54,7 +54,11 @@ typedef enum {
CUDA_ERROR_INVALID_CONTEXT = 201,
CUDA_ERROR_NOT_FOUND = 500,
CUDA_ERROR_NOT_READY = 600,
- CUDA_ERROR_LAUNCH_FAILED = 719
+ CUDA_ERROR_LAUNCH_FAILED = 719,
+ CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
+ CUDA_ERROR_NOT_PERMITTED = 800,
+ CUDA_ERROR_NOT_SUPPORTED = 801,
+ CUDA_ERROR_UNKNOWN = 999
} CUresult;
typedef enum {
@@ -173,6 +177,8 @@ CUresult cuModuleLoadData (CUmodule *, const void
CUresult cuModuleUnload (CUmodule);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
+typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream);
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c (revision 269183)
+++ libgomp/plugin/plugin-nvptx.c (working copy)
@@ -192,175 +192,30 @@ cuda_error (CUresult r)
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
-struct cuda_map
+/* NVPTX/CUDA specific definition of asynchronous queues. */
+struct goacc_asyncqueue
{
- CUdeviceptr d;
- size_t size;
- bool active;
- struct cuda_map *next;
+ CUstream cuda_stream;
};
-struct ptx_stream
+struct nvptx_callback
{
- CUstream stream;
- pthread_t host_thread;
- bool multithreaded;
- struct cuda_map *map;
- struct ptx_stream *next;
+ void (*fn) (void *);
+ void *ptr;
+ struct goacc_asyncqueue *aq;
+ struct nvptx_callback *next;
};
/* Thread-specific data for PTX. */
struct nvptx_thread
{
- struct ptx_stream *current_stream;
+ /* We currently have this embedded inside the plugin because libgomp manages
+ devices through integer target_ids. This might be better if using an
+ opaque target-specific pointer directly from gomp_device_descr. */
struct ptx_device *ptx_dev;
};
-static struct cuda_map *
-cuda_map_create (size_t size)
-{
- struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
-
- assert (map);
-
- map->next = NULL;
- map->size = size;
- map->active = false;
-
- CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
- assert (map->d);
-
- return map;
-}
-
-static void
-cuda_map_destroy (struct cuda_map *map)
-{
- if (map->active)
- /* Possible reasons for the map to be still active:
- - the associated async kernel might still be running.
- - the associated async kernel might have finished, but the
- corresponding event that should trigger the pop_map has not been
- processed by event_gc.
- - the associated sync kernel might have aborted
-
- The async cases could happen if the user specified an async region
- without adding a corresponding wait that is guaranteed to be executed
- (before returning from main, or in an atexit handler).
- We do not want to deallocate a device pointer that is still being
- used, so skip it.
-
- In the sync case, the device pointer is no longer used, but deallocating
- it using cuMemFree will not succeed, so skip it.
-
- TODO: Handle this in a more constructive way, by f.i. waiting for streams
- to finish before de-allocating them (PR88981), or by ensuring the CUDA
- lib atexit handler is called before rather than after the libgomp plugin
- atexit handler (PR83795). */
- ;
- else
- CUDA_CALL_NOCHECK (cuMemFree, map->d);
-
- free (map);
-}
-
-/* The following map_* routines manage the CUDA device memory that
- contains the data mapping arguments for cuLaunchKernel. Each
- asynchronous PTX stream may have multiple pending kernel
- invocations, which are launched in a FIFO order. As such, the map
- routines maintains a queue of cuLaunchKernel arguments.
-
- Calls to map_push and map_pop must be guarded by ptx_event_lock.
- Likewise, calls to map_init and map_fini are guarded by
- ptx_dev_lock inside GOMP_OFFLOAD_init_device and
- GOMP_OFFLOAD_fini_device, respectively. */
-
-static bool
-map_init (struct ptx_stream *s)
-{
- int size = getpagesize ();
-
- assert (s);
-
- s->map = cuda_map_create (size);
-
- return true;
-}
-
-static bool
-map_fini (struct ptx_stream *s)
-{
- assert (s->map->next == NULL);
-
- cuda_map_destroy (s->map);
-
- return true;
-}
-
-static void
-map_pop (struct ptx_stream *s)
-{
- struct cuda_map *next;
-
- assert (s != NULL);
-
- if (s->map->next == NULL)
- {
- s->map->active = false;
- return;
- }
-
- next = s->map->next;
- cuda_map_destroy (s->map);
- s->map = next;
-}
-
-static CUdeviceptr
-map_push (struct ptx_stream *s, size_t size)
-{
- struct cuda_map *map = NULL;
- struct cuda_map **t;
-
- assert (s);
- assert (s->map);
-
- /* Select an element to push. */
- if (s->map->active)
- map = cuda_map_create (size);
- else
- {
- /* Pop the inactive front element. */
- struct cuda_map *pop = s->map;
- s->map = pop->next;
- pop->next = NULL;
-
- if (pop->size < size)
- {
- cuda_map_destroy (pop);
-
- map = cuda_map_create (size);
- }
- else
- map = pop;
- }
-
- /* Check that the element is as expected. */
- assert (map->next == NULL);
- assert (!map->active);
-
- /* Mark the element active. */
- map->active = true;
-
- /* Push the element to the back of the list. */
- for (t = &s->map; (*t) != NULL; t = &(*t)->next)
- ;
- assert (t != NULL && *t == NULL);
- *t = map;
-
- return map->d;
-}
-
/* Target data function launch information. */
struct targ_fn_launch
@@ -412,22 +267,18 @@ struct ptx_image_data
struct ptx_image_data *next;
};
+struct ptx_free_block
+{ /* Node of ptx_device's free_blocks list: a device free that nvptx_free postponed. */
+ void *ptr; /* Device pointer; passed to cuMemFree in nvptx_close_device. */
+ struct ptx_free_block *next; /* Next postponed block, or NULL at the end of the list. */
+};
+
struct ptx_device
{
CUcontext ctx;
bool ctx_shared;
CUdevice dev;
- struct ptx_stream *null_stream;
- /* All non-null streams associated with this device (actually context),
- either created implicitly or passed in from the user (via
- acc_set_cuda_stream). */
- struct ptx_stream *active_streams;
- struct {
- struct ptx_stream **arr;
- int size;
- } async_streams;
- /* A lock for use when manipulating the above stream list and array. */
- pthread_mutex_t stream_lock;
+
int ord;
bool overlap;
bool map;
@@ -445,32 +296,13 @@ struct ptx_device
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
-
- struct ptx_device *next;
-};
-enum ptx_event_type
-{
- PTX_EVT_MEM,
- PTX_EVT_KNL,
- PTX_EVT_SYNC,
- PTX_EVT_ASYNC_CLEANUP
-};
+ struct ptx_free_block *free_blocks;
+ pthread_mutex_t free_blocks_lock;
-struct ptx_event
-{
- CUevent *evt;
- int type;
- void *addr;
- int ord;
- int val;
-
- struct ptx_event *next;
+ struct ptx_device *next;
};
-static pthread_mutex_t ptx_event_lock;
-static struct ptx_event *ptx_events;
-
static struct ptx_device **ptx_devices;
static inline struct nvptx_thread *
@@ -479,193 +311,6 @@ nvptx_thread (void)
return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
-static bool
-init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
-{
- int i;
- struct ptx_stream *null_stream
- = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
- null_stream->stream = NULL;
- null_stream->host_thread = pthread_self ();
- null_stream->multithreaded = true;
- if (!map_init (null_stream))
- return false;
-
- ptx_dev->null_stream = null_stream;
- ptx_dev->active_streams = NULL;
- pthread_mutex_init (&ptx_dev->stream_lock, NULL);
-
- if (concurrency < 1)
- concurrency = 1;
-
- /* This is just a guess -- make space for as many async streams as the
- current device is capable of concurrently executing. This can grow
- later as necessary. No streams are created yet. */
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
- ptx_dev->async_streams.size = concurrency;
-
- for (i = 0; i < concurrency; i++)
- ptx_dev->async_streams.arr[i] = NULL;
-
- return true;
-}
-
-static bool
-fini_streams_for_device (struct ptx_device *ptx_dev)
-{
- free (ptx_dev->async_streams.arr);
-
- bool ret = true;
- while (ptx_dev->active_streams != NULL)
- {
- struct ptx_stream *s = ptx_dev->active_streams;
- ptx_dev->active_streams = ptx_dev->active_streams->next;
-
- ret &= map_fini (s);
-
- CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
- if (r != CUDA_SUCCESS)
- {
- GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
- ret = false;
- }
- free (s);
- }
-
- ret &= map_fini (ptx_dev->null_stream);
- free (ptx_dev->null_stream);
- return ret;
-}
-
-/* Select a stream for (OpenACC-semantics) ASYNC argument for the current
- thread THREAD (and also current device/context). If CREATE is true, create
- the stream if it does not exist (or use EXISTING if it is non-NULL), and
- associate the stream with the same thread argument. Returns stream to use
- as result. */
-
-static struct ptx_stream *
-select_stream_for_async (int async, pthread_t thread, bool create,
- CUstream existing)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
- /* Local copy of TLS variable. */
- struct ptx_device *ptx_dev = nvthd->ptx_dev;
- struct ptx_stream *stream = NULL;
- int orig_async = async;
-
- /* The special value acc_async_noval (-1) maps (for now) to an
- implicitly-created stream, which is then handled the same as any other
- numbered async stream. Other options are available, e.g. using the null
- stream for anonymous async operations, or choosing an idle stream from an
- active set. But, stick with this for now. */
- if (async > acc_async_sync)
- async++;
-
- if (create)
- pthread_mutex_lock (&ptx_dev->stream_lock);
-
- /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
- null stream, and in fact better performance may be obtainable if it doesn't
- (because the null stream enforces overly-strict synchronisation with
- respect to other streams for legacy reasons, and that's probably not
- needed with OpenACC). Maybe investigate later. */
- if (async == acc_async_sync)
- stream = ptx_dev->null_stream;
- else if (async >= 0 && async < ptx_dev->async_streams.size
- && ptx_dev->async_streams.arr[async] && !(create && existing))
- stream = ptx_dev->async_streams.arr[async];
- else if (async >= 0 && create)
- {
- if (async >= ptx_dev->async_streams.size)
- {
- int i, newsize = ptx_dev->async_streams.size * 2;
-
- if (async >= newsize)
- newsize = async + 1;
-
- ptx_dev->async_streams.arr
- = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
- newsize * sizeof (struct ptx_stream *));
-
- for (i = ptx_dev->async_streams.size; i < newsize; i++)
- ptx_dev->async_streams.arr[i] = NULL;
-
- ptx_dev->async_streams.size = newsize;
- }
-
- /* Create a new stream on-demand if there isn't one already, or if we're
- setting a particular async value to an existing (externally-provided)
- stream. */
- if (!ptx_dev->async_streams.arr[async] || existing)
- {
- CUresult r;
- struct ptx_stream *s
- = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
-
- if (existing)
- s->stream = existing;
- else
- {
- r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
- CU_STREAM_DEFAULT);
- if (r != CUDA_SUCCESS)
- {
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
- cuda_error (r));
- }
- }
-
- /* If CREATE is true, we're going to be queueing some work on this
- stream. Associate it with the current host thread. */
- s->host_thread = thread;
- s->multithreaded = false;
-
- if (!map_init (s))
- {
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- GOMP_PLUGIN_fatal ("map_init fail");
- }
-
- s->next = ptx_dev->active_streams;
- ptx_dev->active_streams = s;
- ptx_dev->async_streams.arr[async] = s;
- }
-
- stream = ptx_dev->async_streams.arr[async];
- }
- else if (async < 0)
- {
- if (create)
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- GOMP_PLUGIN_fatal ("bad async %d", async);
- }
-
- if (create)
- {
- assert (stream != NULL);
-
- /* If we're trying to use the same stream from different threads
- simultaneously, set stream->multithreaded to true. This affects the
- behaviour of acc_async_test_all and acc_wait_all, which are supposed to
- only wait for asynchronous launches from the same host thread they are
- invoked on. If multiple threads use the same async value, we make note
- of that here and fall back to testing/waiting for all threads in those
- functions. */
- if (thread != stream->host_thread)
- stream->multithreaded = true;
-
- pthread_mutex_unlock (&ptx_dev->stream_lock);
- }
- else if (stream && !stream->multithreaded
- && !pthread_equal (stream->host_thread, thread))
- GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
-
- return stream;
-}
-
/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
should be locked on entry and remains locked on exit. */
@@ -677,9 +322,6 @@ nvptx_init (void)
if (instantiated_devices != 0)
return true;
- ptx_events = NULL;
- pthread_mutex_init (&ptx_event_lock, NULL);
-
if (!init_cuda_lib ())
return false;
@@ -703,6 +345,11 @@ nvptx_attach_host_thread_to_device (int n)
CUcontext thd_ctx;
r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
+ if (r == CUDA_ERROR_NOT_PERMITTED)
+ {
+ /* Assume we're in a CUDA callback, just return true. */
+ return true;
+ }
if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
{
GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
@@ -847,8 +494,8 @@ nvptx_open_device (int n)
ptx_dev->images = NULL;
pthread_mutex_init (&ptx_dev->image_lock, NULL);
- if (!init_streams_for_device (ptx_dev, async_engines))
- return NULL;
+ ptx_dev->free_blocks = NULL;
+ pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
return ptx_dev;
}
@@ -859,9 +506,15 @@ nvptx_close_device (struct ptx_device *ptx_dev)
if (!ptx_dev)
return true;
- if (!fini_streams_for_device (ptx_dev))
- return false;
-
+ for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
+ {
+ struct ptx_free_block *b_next = b->next;
+ CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
+ free (b);
+ b = b_next;
+ }
+
+ pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
pthread_mutex_destroy (&ptx_dev->image_lock);
if (!ptx_dev->ctx_shared)
@@ -1041,139 +694,19 @@ link_ptx (CUmodule *module, const struct targ_ptx_
}
static void
-event_gc (bool memmap_lockable)
-{
- struct ptx_event *ptx_event = ptx_events;
- struct ptx_event *async_cleanups = NULL;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- pthread_mutex_lock (&ptx_event_lock);
-
- while (ptx_event != NULL)
- {
- CUresult r;
- struct ptx_event *e = ptx_event;
-
- ptx_event = ptx_event->next;
-
- if (e->ord != nvthd->ptx_dev->ord)
- continue;
-
- r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
- if (r == CUDA_SUCCESS)
- {
- bool append_async = false;
- CUevent *te;
-
- te = e->evt;
-
- switch (e->type)
- {
- case PTX_EVT_MEM:
- case PTX_EVT_SYNC:
- break;
-
- case PTX_EVT_KNL:
- map_pop (e->addr);
- break;
-
- case PTX_EVT_ASYNC_CLEANUP:
- {
- /* The function gomp_plugin_async_unmap_vars needs to claim the
- memory-map splay tree lock for the current device, so we
- can't call it when one of our callers has already claimed
- the lock. In that case, just delay the GC for this event
- until later. */
- if (!memmap_lockable)
- continue;
-
- append_async = true;
- }
- break;
- }
-
- CUDA_CALL_NOCHECK (cuEventDestroy, *te);
- free ((void *)te);
-
- /* Unlink 'e' from ptx_events list. */
- if (ptx_events == e)
- ptx_events = ptx_events->next;
- else
- {
- struct ptx_event *e_ = ptx_events;
- while (e_->next != e)
- e_ = e_->next;
- e_->next = e_->next->next;
- }
-
- if (append_async)
- {
- e->next = async_cleanups;
- async_cleanups = e;
- }
- else
- free (e);
- }
- }
-
- pthread_mutex_unlock (&ptx_event_lock);
-
- /* We have to do these here, after ptx_event_lock is released. */
- while (async_cleanups)
- {
- struct ptx_event *e = async_cleanups;
- async_cleanups = async_cleanups->next;
-
- GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
- free (e);
- }
-}
-
-static void
-event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
-{
- struct ptx_event *ptx_event;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
- || type == PTX_EVT_ASYNC_CLEANUP);
-
- ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
- ptx_event->type = type;
- ptx_event->evt = e;
- ptx_event->addr = h;
- ptx_event->ord = nvthd->ptx_dev->ord;
- ptx_event->val = val;
-
- pthread_mutex_lock (&ptx_event_lock);
-
- ptx_event->next = ptx_events;
- ptx_events = ptx_event;
-
- pthread_mutex_unlock (&ptx_event_lock);
-}
-
-static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
- int async, unsigned *dims, void *targ_mem_desc)
+ unsigned *dims, void *targ_mem_desc,
+ CUdeviceptr dp, CUstream stream)
{
struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
CUfunction function;
- CUresult r;
int i;
- struct ptx_stream *dev_str;
void *kargs[1];
- void *hp;
- CUdeviceptr dp = 0;
struct nvptx_thread *nvthd = nvptx_thread ();
int warp_size = nvthd->ptx_dev->warp_size;
- const char *maybe_abort_msg = "(perhaps abort was called)";
function = targ_fn->fn;
- dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
- assert (dev_str == nvthd->current_stream);
-
/* Initialize the launch dimensions. Typically this is constant,
provided by the device compiler, but we must permit runtime
values. */
@@ -1361,27 +894,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
dims[GOMP_DIM_VECTOR]);
}
- if (mapnum > 0)
- {
- /* This reserves a chunk of a pre-allocated page of memory mapped on both
- the host and the device. HP is a host pointer to the new chunk, and DP is
- the corresponding device pointer. */
- pthread_mutex_lock (&ptx_event_lock);
- dp = map_push (dev_str, mapnum * sizeof (void *));
- pthread_mutex_unlock (&ptx_event_lock);
-
- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
-
- /* Copy the array of arguments to the mapped page. */
- hp = alloca(sizeof(void *) * mapnum);
- for (i = 0; i < mapnum; i++)
- ((void **) hp)[i] = devaddrs[i];
-
- /* Copy the (device) pointers to arguments to the device */
- CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
- mapnum * sizeof (void *));
- }
-
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" gangs=%u, workers=%u, vectors=%u\n",
__FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
@@ -1392,62 +904,14 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
// num_gangs nctaid.x
// num_workers ntid.y
// vector length ntid.x
-
kargs[0] = &dp;
CUDA_CALL_ASSERT (cuLaunchKernel, function,
dims[GOMP_DIM_GANG], 1, 1,
dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
- 0, dev_str->stream, kargs, 0);
+ 0, stream, kargs, 0);
-#ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
- {
- r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
- }
- else
- {
- CUevent *e;
-
- e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
-
- event_gc (true);
-
- CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
-
- if (mapnum > 0)
- event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
- }
-#else
- r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
- if (r == CUDA_ERROR_LAUNCH_FAILED)
- GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
- maybe_abort_msg);
- else if (r != CUDA_SUCCESS)
- GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
-#endif
-
GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
targ_fn->launch->fn);
-
-#ifndef DISABLE_ASYNC
- if (async < acc_async_noval)
-#endif
- {
- if (mapnum > 0)
- map_pop (dev_str);
- }
}
void * openacc_get_current_cuda_context (void);
@@ -1462,8 +926,21 @@ nvptx_alloc (size_t s)
}
static bool
-nvptx_free (void *p)
+nvptx_free (void *p, struct ptx_device *ptx_dev)
{
+ /* Assume callback context if this is null. */
+ if (GOMP_PLUGIN_acc_thread () == NULL)
+ {
+ struct ptx_free_block *n
+ = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+ n->ptr = p;
+ pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+ n->next = ptx_dev->free_blocks;
+ ptx_dev->free_blocks = n;
+ pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+ return true;
+ }
+
CUdeviceptr pb;
size_t ps;
@@ -1478,305 +955,6 @@ static bool
return true;
}
-
-static bool
-nvptx_host2dev (void *d, const void *h, size_t s)
-{
- CUdeviceptr pb;
- size_t ps;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!s)
- return true;
- if (!d)
- {
- GOMP_PLUGIN_error ("invalid device address");
- return false;
- }
-
- CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
- if (!pb)
- {
- GOMP_PLUGIN_error ("invalid device address");
- return false;
- }
- if (!h)
- {
- GOMP_PLUGIN_error ("invalid host address");
- return false;
- }
- if (d == h)
- {
- GOMP_PLUGIN_error ("invalid host or device address");
- return false;
- }
- if ((void *)(d + s) > (void *)(pb + ps))
- {
- GOMP_PLUGIN_error ("invalid size");
- return false;
- }
-
-#ifndef DISABLE_ASYNC
- if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
- CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
- event_gc (false);
- CUDA_CALL (cuMemcpyHtoDAsync,
- (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
- CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
- event_add (PTX_EVT_MEM, e, (void *)h, 0);
- }
- else
-#endif
- CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
-
- return true;
-}
-
-static bool
-nvptx_dev2host (void *h, const void *d, size_t s)
-{
- CUdeviceptr pb;
- size_t ps;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!s)
- return true;
- if (!d)
- {
- GOMP_PLUGIN_error ("invalid device address");
- return false;
- }
-
- CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
-
- if (!pb)
- {
- GOMP_PLUGIN_error ("invalid device address");
- return false;
- }
- if (!h)
- {
- GOMP_PLUGIN_error ("invalid host address");
- return false;
- }
- if (d == h)
- {
- GOMP_PLUGIN_error ("invalid host or device address");
- return false;
- }
- if ((void *)(d + s) > (void *)(pb + ps))
- {
- GOMP_PLUGIN_error ("invalid size");
- return false;
- }
-
-#ifndef DISABLE_ASYNC
- if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
- {
- CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
- CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
- event_gc (false);
- CUDA_CALL (cuMemcpyDtoHAsync,
- h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
- CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
- event_add (PTX_EVT_MEM, e, (void *)h, 0);
- }
- else
-#endif
- CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
-
- return true;
-}
-
-static void
-nvptx_set_async (int async)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
- nvthd->current_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
-}
-
-static int
-nvptx_async_test (int async)
-{
- CUresult r;
- struct ptx_stream *s;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
- if (!s)
- return 1;
-
- r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
- if (r == CUDA_SUCCESS)
- {
- /* The oacc-parallel.c:goacc_wait function calls this hook to determine
- whether all work has completed on this stream, and if so omits the call
- to the wait hook. If that happens, event_gc might not get called
- (which prevents variables from getting unmapped and their associated
- device storage freed), so call it here. */
- event_gc (true);
- return 1;
- }
- else if (r == CUDA_ERROR_NOT_READY)
- return 0;
-
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
- return 0;
-}
-
-static int
-nvptx_async_test_all (void)
-{
- struct ptx_stream *s;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if ((s->multithreaded || pthread_equal (s->host_thread, self))
- && CUDA_CALL_NOCHECK (cuStreamQuery,
- s->stream) == CUDA_ERROR_NOT_READY)
- {
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
- return 0;
- }
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- event_gc (true);
-
- return 1;
-}
-
-static void
-nvptx_wait (int async)
-{
- struct ptx_stream *s;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
- if (!s)
- return;
-
- CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
-
- event_gc (true);
-}
-
-static void
-nvptx_wait_async (int async1, int async2)
-{
- CUevent *e;
- struct ptx_stream *s1, *s2;
- pthread_t self = pthread_self ();
-
- s1 = select_stream_for_async (async1, self, false, NULL);
- if (!s1)
- return;
-
- /* The stream that is waiting (rather than being waited for) doesn't
- necessarily have to exist already. */
- s2 = select_stream_for_async (async2, self, true, NULL);
-
- /* A stream is always synchronized with itself. */
- if (s1 == s2)
- return;
-
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
- event_gc (true);
-
- CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
-
- event_add (PTX_EVT_SYNC, e, NULL, 0);
-
- CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
-}
-
-static void
-nvptx_wait_all (void)
-{
- CUresult r;
- struct ptx_stream *s;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- /* Wait for active streams initiated by this thread (or by multiple threads)
- to complete. */
- for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
- {
- if (s->multithreaded || pthread_equal (s->host_thread, self))
- {
- r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
- if (r == CUDA_SUCCESS)
- continue;
- else if (r != CUDA_ERROR_NOT_READY)
- GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
-
- CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
- }
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- event_gc (true);
-}
-
-static void
-nvptx_wait_all_async (int async)
-{
- struct ptx_stream *waiting_stream, *other_stream;
- CUevent *e;
- struct nvptx_thread *nvthd = nvptx_thread ();
- pthread_t self = pthread_self ();
-
- /* The stream doing the waiting. This could be the first mention of the
- stream, so create it if necessary. */
- waiting_stream
- = select_stream_for_async (async, pthread_self (), true, NULL);
-
- /* Launches on the null stream already block on other streams in the
- context. */
- if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
- return;
-
- event_gc (true);
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- for (other_stream = nvthd->ptx_dev->active_streams;
- other_stream != NULL;
- other_stream = other_stream->next)
- {
- if (!other_stream->multithreaded
- && !pthread_equal (other_stream->host_thread, self))
- continue;
-
- e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
-
- CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
-
- /* Record an event on the waited-for stream. */
- CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
-
- event_add (PTX_EVT_SYNC, e, NULL, 0);
-
- CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-}
-
static void *
nvptx_get_current_cuda_device (void)
{
@@ -1799,75 +977,6 @@ nvptx_get_current_cuda_context (void)
return nvthd->ptx_dev->ctx;
}
-static void *
-nvptx_get_cuda_stream (int async)
-{
- struct ptx_stream *s;
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- if (!nvthd || !nvthd->ptx_dev)
- return NULL;
-
- s = select_stream_for_async (async, pthread_self (), false, NULL);
-
- return s ? s->stream : NULL;
-}
-
-static int
-nvptx_set_cuda_stream (int async, void *stream)
-{
- struct ptx_stream *oldstream;
- pthread_t self = pthread_self ();
- struct nvptx_thread *nvthd = nvptx_thread ();
-
- /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
- to change the stream handle associated with "acc_async_sync". */
- if (async == acc_async_sync)
- {
- GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
- " with \"acc_async_sync\"\n");
- return 0;
- }
-
- pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
-
- /* We have a list of active streams and an array mapping async values to
- entries of that list. We need to take "ownership" of the passed-in stream,
- and add it to our list, removing the previous entry also (if there was one)
- in order to prevent resource leaks. Note the potential for surprise
- here: maybe we should keep track of passed-in streams and leave it up to
- the user to tidy those up, but that doesn't work for stream handles
- returned from acc_get_cuda_stream above... */
-
- oldstream = select_stream_for_async (async, self, false, NULL);
-
- if (oldstream)
- {
- if (nvthd->ptx_dev->active_streams == oldstream)
- nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
- else
- {
- struct ptx_stream *s = nvthd->ptx_dev->active_streams;
- while (s->next != oldstream)
- s = s->next;
- s->next = s->next->next;
- }
-
- CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
-
- if (!map_fini (oldstream))
- GOMP_PLUGIN_fatal ("error when freeing host memory");
-
- free (oldstream);
- }
-
- pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
-
- (void) select_stream_for_async (async, self, true, (CUstream) stream);
-
- return 1;
-}
-
/* Plugin entry points. */
const char *
@@ -2107,6 +1216,23 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
+
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+ struct ptx_free_block *blocks, *tmp;
+
+ pthread_mutex_lock (&ptx_dev->free_blocks_lock);
+ blocks = ptx_dev->free_blocks;
+ ptx_dev->free_blocks = NULL;
+ pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
+
+ while (blocks)
+ {
+ tmp = blocks->next;
+ nvptx_free (blocks->ptr, ptx_dev);
+ free (blocks);
+ blocks = tmp;
+ }
+
return nvptx_alloc (size);
}
@@ -2114,93 +1240,92 @@ bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
return (nvptx_attach_host_thread_to_device (ord)
- && nvptx_free (ptr));
+ && nvptx_free (ptr, ptx_devices[ord]));
}
-bool
-GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
-{
- return (nvptx_attach_host_thread_to_device (ord)
- && nvptx_dev2host (dst, src, n));
-}
-
-bool
-GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
-{
- return (nvptx_attach_host_thread_to_device (ord)
- && nvptx_host2dev (dst, src, n));
-}
-
-bool
-GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
-{
- struct ptx_device *ptx_dev = ptx_devices[ord];
- CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
- ptx_dev->null_stream->stream);
- return true;
-}
-
-void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
-
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
void **hostaddrs, void **devaddrs,
- int async, unsigned *dims, void *targ_mem_desc)
+ unsigned *dims, void *targ_mem_desc)
{
- nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
-}
+ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
-void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
-{
- struct nvptx_thread *nvthd = nvptx_thread ();
- CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
+ void **hp = NULL;
+ CUdeviceptr dp = 0;
- CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
- CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
- event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
-}
+ if (mapnum > 0)
+ {
+ hp = alloca (mapnum * sizeof (void *));
+ for (int i = 0; i < mapnum; i++)
+ hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+ CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+ }
-int
-GOMP_OFFLOAD_openacc_async_test (int async)
-{
- return nvptx_async_test (async);
-}
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ if (mapnum > 0)
+ CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
+ mapnum * sizeof (void *));
-int
-GOMP_OFFLOAD_openacc_async_test_all (void)
-{
- return nvptx_async_test_all ();
-}
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+ dp, NULL);
-void
-GOMP_OFFLOAD_openacc_async_wait (int async)
-{
- nvptx_wait (async);
+ CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
+ const char *maybe_abort_msg = "(perhaps abort was called)";
+ if (r == CUDA_ERROR_LAUNCH_FAILED)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
+ maybe_abort_msg);
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
+ CUDA_CALL_ASSERT (cuMemFree, dp);
}
-void
-GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
+static void
+cuda_free_argmem (void *ptr)
{
- nvptx_wait_async (async1, async2);
+ void **block = (void **) ptr;
+ nvptx_free (block[0], (struct ptx_device *) block[1]);
+ free (block);
}
void
-GOMP_OFFLOAD_openacc_async_wait_all (void)
+GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
+ void **hostaddrs, void **devaddrs,
+ unsigned *dims, void *targ_mem_desc,
+ struct goacc_asyncqueue *aq)
{
- nvptx_wait_all ();
-}
+ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
-void
-GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
-{
- nvptx_wait_all_async (async);
-}
+ void **hp = NULL;
+ CUdeviceptr dp = 0;
+ void **block = NULL;
-void
-GOMP_OFFLOAD_openacc_async_set_async (int async)
-{
- nvptx_set_async (async);
+ if (mapnum > 0)
+ {
+ block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
+ hp = block + 2;
+ for (int i = 0; i < mapnum; i++)
+ hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
+ CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
+ }
+
+ /* Copy the (device) pointers to arguments to the device (dp and hp might in
+ fact have the same value on a unified-memory system). */
+ if (mapnum > 0)
+ {
+ CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
+ mapnum * sizeof (void *), aq->cuda_stream);
+ block[0] = (void *) dp;
+
+ struct nvptx_thread *nvthd =
+ (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
+ block[1] = (void *) nvthd->ptx_dev;
+ }
+ nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
+ dp, aq->cuda_stream);
+
+ if (mapnum > 0)
+ GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
}
void *
@@ -2222,7 +1347,6 @@ GOMP_OFFLOAD_openacc_create_thread_data (int ord)
if (!thd_ctx)
CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
- nvthd->current_stream = ptx_dev->null_stream;
nvthd->ptx_dev = ptx_dev;
return (void *) nvthd;
@@ -2246,22 +1370,186 @@ GOMP_OFFLOAD_openacc_cuda_get_current_context (voi
return nvptx_get_current_cuda_context ();
}
-/* NOTE: This returns a CUstream, not a ptx_stream pointer. */
-
+/* This returns a CUstream. */
void *
-GOMP_OFFLOAD_openacc_cuda_get_stream (int async)
+GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
{
- return nvptx_get_cuda_stream (async);
+ return (void *) aq->cuda_stream;
}
-/* NOTE: This takes a CUstream, not a ptx_stream pointer. */
+/* This takes a CUstream. */
+int
+GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
+{
+ if (aq->cuda_stream)
+ {
+ CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
+ CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
+ }
+ aq->cuda_stream = (CUstream) stream;
+ return 1;
+}
+
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (void)
+{
+ CUstream stream = NULL;
+ CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+
+ struct goacc_asyncqueue *aq
+ = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
+ aq->cuda_stream = stream;
+ return aq;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+ CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
+ free (aq);
+ return true;
+}
+
int
-GOMP_OFFLOAD_openacc_cuda_set_stream (int async, void *stream)
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
- return nvptx_set_cuda_stream (async, stream);
+ CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
+ if (r == CUDA_SUCCESS)
+ return 1;
+ if (r == CUDA_ERROR_NOT_READY)
+ return 0;
+
+ GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
+ return -1;
}
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+ CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+ struct goacc_asyncqueue *aq2)
+{
+ CUevent e;
+ CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
+ CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
+ CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
+ return true;
+}
+
+static void
+cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
+{
+ if (res != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
+ struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
+ cb->fn (cb->ptr);
+ free (ptr);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+ void (*callback_fn)(void *),
+ void *userptr)
+{
+ struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
+ b->fn = callback_fn;
+ b->ptr = userptr;
+ b->aq = aq;
+ CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
+ cuda_callback_wrapper, (void *) b, 0);
+}
+
+static bool
+cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
+{
+ CUdeviceptr pb;
+ size_t ps;
+ if (!s)
+ return true;
+ if (!d)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
+ if (!pb)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ if (!h)
+ {
+ GOMP_PLUGIN_error ("invalid host address");
+ return false;
+ }
+ if (d == h)
+ {
+ GOMP_PLUGIN_error ("invalid host or device address");
+ return false;
+ }
+ if ((void *)(d + s) > (void *)(pb + ps))
+ {
+ GOMP_PLUGIN_error ("invalid size");
+ return false;
+ }
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (src, dst, n))
+ return false;
+ CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (dst, src, n))
+ return false;
+ CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
+{
+ CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (src, dst, n))
+ return false;
+ CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
+ return true;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_sanity_check (dst, src, n))
+ return false;
+ CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
+ return true;
+}
+
/* Adjust launch dimensions: pick good values for number of blocks and warps
and ensure that number of warps does not exceed CUDA limits as well as GCC's
own limits. */
@@ -2360,8 +1648,7 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt
CU_LAUNCH_PARAM_END
};
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
- 32, threads, 1, 0, ptx_dev->null_stream->stream,
- NULL, config);
+ 32, threads, 1, 0, NULL, NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
Index: libgomp/plugin/cuda-lib.def
===================================================================
--- libgomp/plugin/cuda-lib.def (revision 269183)
+++ libgomp/plugin/cuda-lib.def (working copy)
@@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
+CUDA_ONE_CALL (cuStreamAddCallback)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
CUDA_ONE_CALL (cuStreamQuery)
Index: libgomp/oacc-mem.c
===================================================================
--- libgomp/oacc-mem.c (revision 269183)
+++ libgomp/oacc-mem.c (working copy)
@@ -172,18 +172,11 @@ memcpy_tofrom_device (bool from, void *d, void *h,
return;
}
- if (async > acc_async_sync)
- thr->dev->openacc.async_set_async_func (async);
-
- bool ret = (from
- ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
- : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
-
- if (async > acc_async_sync)
- thr->dev->openacc.async_set_async_func (acc_async_sync);
-
- if (!ret)
- gomp_fatal ("error in %s", libfnname);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ if (from)
+ gomp_copy_dev2host (thr->dev, aq, h, d, s);
+ else
+ gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
}
void
@@ -509,17 +502,13 @@ present_create_copy (unsigned f, void *h, size_t s
gomp_mutex_unlock (&acc_dev->lock);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (async);
+ goacc_aq aq = get_goacc_asyncqueue (async);
- tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
- GOMP_MAP_VARS_OPENACC);
+ tgt = gomp_map_vars_async (acc_dev, aq, mapnum, &hostaddrs, NULL, &s,
+ &kinds, true, GOMP_MAP_VARS_OPENACC);
/* Initialize dynamic refcount. */
tgt->list[0].key->dynamic_refcount = 1;
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (acc_async_sync);
-
gomp_mutex_lock (&acc_dev->lock);
d = tgt->to_free;
@@ -676,13 +665,9 @@ delete_copyout (unsigned f, void *h, size_t s, int
if (f & FLAG_COPYOUT)
{
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (async);
- acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (acc_async_sync);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ gomp_copy_dev2host (acc_dev, aq, h, d, s);
}
-
gomp_remove_var (acc_dev, n);
}
@@ -765,17 +750,13 @@ update_dev_host (int is_dev, void *h, size_t s, in
d = (void *) (n->tgt->tgt_start + n->tgt_offset
+ (uintptr_t) h - n->host_start);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (async);
+ goacc_aq aq = get_goacc_asyncqueue (async);
if (is_dev)
- acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
+ gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
else
- acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+ gomp_copy_dev2host (acc_dev, aq, h, d, s);
- if (async > acc_async_sync)
- acc_dev->openacc.async_set_async_func (acc_async_sync);
-
gomp_mutex_unlock (&acc_dev->lock);
}
@@ -805,7 +786,7 @@ acc_update_self_async (void *h, size_t s, int asyn
void
gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
- void *kinds)
+ void *kinds, int async)
{
struct target_mem_desc *tgt;
struct goacc_thread *thr = goacc_thread ();
@@ -835,8 +816,9 @@ gomp_acc_insert_pointer (size_t mapnum, void **hos
}
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
- tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
- NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs,
+ NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
/* Initialize dynamic refcount. */
@@ -930,7 +912,10 @@ gomp_acc_remove_pointer (void *h, size_t s, bool f
if (async < acc_async_noval)
gomp_unmap_vars (t, true);
else
- t->device_descr->openacc.register_async_cleanup_func (t, async);
+ {
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ gomp_unmap_vars_async (t, true, aq);
+ }
}
gomp_mutex_unlock (&acc_dev->lock);
Index: libgomp/oacc-parallel.c
===================================================================
--- libgomp/oacc-parallel.c (revision 269183)
+++ libgomp/oacc-parallel.c (working copy)
@@ -217,8 +217,6 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (voi
}
va_end (ap);
- acc_dev->openacc.async_set_async_func (async);
-
if (!(acc_dev->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC))
{
k.host_start = (uintptr_t) fn;
@@ -235,44 +233,29 @@ GOACC_parallel_keyed (int flags_m, void (*fn) (voi
else
tgt_fn = (void (*)) fn;
- tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
- GOMP_MAP_VARS_OPENACC);
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
+ true, GOMP_MAP_VARS_OPENACC);
+
devaddrs = gomp_alloca (sizeof (void *) * mapnum);
for (i = 0; i < mapnum; i++)
devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
+ tgt->list[i].key->tgt_offset
+ tgt->list[i].offset);
-
- acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
- async, dims, tgt);
-
- /* If running synchronously, unmap immediately. */
- bool copyfrom = true;
- if (async_synchronous_p (async))
- gomp_unmap_vars (tgt, true);
+ if (aq == NULL)
+ {
+ acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+ dims, tgt);
+ /* If running synchronously, unmap immediately. */
+ gomp_unmap_vars (tgt, true);
+ }
else
{
- bool async_unmap = false;
- for (size_t i = 0; i < tgt->list_count; i++)
- {
- splay_tree_key k = tgt->list[i].key;
- if (k && k->refcount == 1)
- {
- async_unmap = true;
- break;
- }
- }
- if (async_unmap)
- tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
- else
- {
- copyfrom = false;
- gomp_unmap_vars (tgt, copyfrom);
- }
+ acc_dev->openacc.async.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
+ dims, tgt, aq);
+ gomp_unmap_vars_async (tgt, true, aq);
}
-
- acc_dev->openacc.async_set_async_func (acc_async_sync);
}
/* Legacy entry point, only provide host execution. */
@@ -383,8 +366,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
finalize = true;
}
- acc_dev->openacc.async_set_async_func (async);
-
/* Determine if this is an "acc enter data". */
for (i = 0; i < mapnum; ++i)
{
@@ -437,11 +418,11 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
{
case GOMP_MAP_ALLOC:
case GOMP_MAP_FORCE_ALLOC:
- acc_create (hostaddrs[i], sizes[i]);
+ acc_create_async (hostaddrs[i], sizes[i], async);
break;
case GOMP_MAP_TO:
case GOMP_MAP_FORCE_TO:
- acc_copyin (hostaddrs[i], sizes[i]);
+ acc_copyin_async (hostaddrs[i], sizes[i], async);
break;
default:
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -452,7 +433,7 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
else
{
gomp_acc_insert_pointer (pointer, &hostaddrs[i],
- &sizes[i], &kinds[i]);
+ &sizes[i], &kinds[i], async);
/* Increment 'i' by two because OpenACC requires fortran
arrays to be contiguous, so each PSET is associated with
one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
@@ -477,17 +458,17 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
if (acc_is_present (hostaddrs[i], sizes[i]))
{
if (finalize)
- acc_delete_finalize (hostaddrs[i], sizes[i]);
+ acc_delete_finalize_async (hostaddrs[i], sizes[i], async);
else
- acc_delete (hostaddrs[i], sizes[i]);
+ acc_delete_async (hostaddrs[i], sizes[i], async);
}
break;
case GOMP_MAP_FROM:
case GOMP_MAP_FORCE_FROM:
if (finalize)
- acc_copyout_finalize (hostaddrs[i], sizes[i]);
+ acc_copyout_finalize_async (hostaddrs[i], sizes[i], async);
else
- acc_copyout (hostaddrs[i], sizes[i]);
+ acc_copyout_async (hostaddrs[i], sizes[i], async);
break;
default:
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -505,8 +486,6 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
i += pointer - 1;
}
}
-
- acc_dev->openacc.async_set_async_func (acc_async_sync);
}
static void
@@ -532,9 +511,10 @@ goacc_wait (int async, int num_waits, va_list *ap)
if (async == acc_async_sync)
acc_wait (qid);
else if (qid == async)
- ;/* If we're waiting on the same asynchronous queue as we're
- launching on, the queue itself will order work as
- required, so there's no need to wait explicitly. */
+ /* If we're waiting on the same asynchronous queue as we're
+ launching on, the queue itself will order work as
+ required, so there's no need to wait explicitly. */
+ ;
else
acc_wait_async (qid, async);
}
@@ -567,8 +547,6 @@ GOACC_update (int flags_m, size_t mapnum,
va_end (ap);
}
- acc_dev->openacc.async_set_async_func (async);
-
bool update_device = false;
for (i = 0; i < mapnum; ++i)
{
@@ -591,6 +569,8 @@ GOACC_update (int flags_m, size_t mapnum,
the value of the allocated device memory in the
previous pointer. */
*(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
+ /* TODO: verify that we really cannot use acc_update_device_async
+ here. */
acc_update_device (hostaddrs[i], sizeof (uintptr_t));
/* Restore the host pointer. */
@@ -608,7 +588,7 @@ GOACC_update (int flags_m, size_t mapnum,
/* Fallthru */
case GOMP_MAP_FORCE_TO:
update_device = true;
- acc_update_device (hostaddrs[i], sizes[i]);
+ acc_update_device_async (hostaddrs[i], sizes[i], async);
break;
case GOMP_MAP_FROM:
@@ -620,7 +600,7 @@ GOACC_update (int flags_m, size_t mapnum,
/* Fallthru */
case GOMP_MAP_FORCE_FROM:
update_device = false;
- acc_update_self (hostaddrs[i], sizes[i]);
+ acc_update_self_async (hostaddrs[i], sizes[i], async);
break;
default:
@@ -628,8 +608,6 @@ GOACC_update (int flags_m, size_t mapnum,
break;
}
}
-
- acc_dev->openacc.async_set_async_func (acc_async_sync);
}
void
Index: libgomp/oacc-init.c
===================================================================
--- libgomp/oacc-init.c (revision 269183)
+++ libgomp/oacc-init.c (working copy)
@@ -309,7 +309,7 @@ acc_shutdown_1 (acc_device_t d)
if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
{
devices_active = true;
- ret &= acc_dev->fini_device_func (acc_dev->target_id);
+ ret &= gomp_fini_device (acc_dev);
acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
}
gomp_mutex_unlock (&acc_dev->lock);
@@ -426,8 +426,6 @@ goacc_attach_host_thread_to_device (int ord)
thr->target_tls
= acc_dev->openacc.create_thread_data_func (ord);
-
- acc_dev->openacc.async_set_async_func (acc_async_sync);
}
/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
Index: libgomp/oacc-cuda.c
===================================================================
--- libgomp/oacc-cuda.c (revision 269183)
+++ libgomp/oacc-cuda.c (working copy)
@@ -30,6 +30,7 @@
#include "config.h"
#include "libgomp.h"
#include "oacc-int.h"
+#include <assert.h>
void *
acc_get_current_cuda_device (void)
@@ -62,7 +63,11 @@ acc_get_cuda_stream (int async)
return NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
- return thr->dev->openacc.cuda.get_stream_func (async);
+ {
+ goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
+ if (aq)
+ return thr->dev->openacc.cuda.get_stream_func (aq);
+ }
return NULL;
}
@@ -79,8 +84,23 @@ acc_set_cuda_stream (int async, void *stream)
thr = goacc_thread ();
+ int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
- return thr->dev->openacc.cuda.set_stream_func (async, stream);
+ {
+ goacc_aq aq = get_goacc_asyncqueue (async);
+ /* Due to not using an asyncqueue for "acc_async_sync", this cannot be
+ used to change the CUDA stream associated with "acc_async_sync". */
+ if (!aq)
+ {
+ assert (async == acc_async_sync);
+ gomp_debug (0, "Refusing request to set CUDA stream associated"
+ " with \"acc_async_sync\"\n");
+ return 0;
+ }
+ gomp_mutex_lock (&thr->dev->openacc.async.lock);
+ ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
+ gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+ }
- return -1;
+ return ret;
}
Index: libgomp/target.c
===================================================================
--- libgomp/target.c (revision 269183)
+++ libgomp/target.c (working copy)
@@ -177,6 +177,22 @@ gomp_device_copy (struct gomp_device_descr *device
}
}
+static inline void
+goacc_device_copy_async (struct gomp_device_descr *devicep,
+ bool (*copy_func) (int, void *, const void *, size_t,
+ struct goacc_asyncqueue *),
+ const char *dst, void *dstaddr,
+ const char *src, const void *srcaddr,
+ size_t size, struct goacc_asyncqueue *aq)
+{
+ if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
+ src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
+ }
+}
+
/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
host to device memory transfers. */
@@ -269,8 +285,9 @@ gomp_to_device_kind_p (int kind)
}
}
-static void
+attribute_hidden void
gomp_copy_host2dev (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq,
void *d, const void *h, size_t sz,
struct gomp_coalesce_buf *cbuf)
{
@@ -299,14 +316,23 @@ gomp_copy_host2dev (struct gomp_device_descr *devi
}
}
}
- gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
+ if (__builtin_expect (aq != NULL, 0))
+ goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
+ "dev", d, "host", h, sz, aq);
+ else
+ gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
}
-static void
+attribute_hidden void
gomp_copy_dev2host (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq,
void *h, const void *d, size_t sz)
{
- gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
+ if (__builtin_expect (aq != NULL, 0))
+ goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
+ "host", h, "dev", d, sz, aq);
+ else
+ gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
}
static void
@@ -324,7 +350,8 @@ gomp_free_device_memory (struct gomp_device_descr
Helper function of gomp_map_vars. */
static inline void
-gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
+gomp_map_vars_existing (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq, splay_tree_key oldn,
splay_tree_key newn, struct target_var_desc *tgt_var,
unsigned char kind, struct gomp_coalesce_buf *cbuf)
{
@@ -346,7 +373,7 @@ static inline void
}
if (GOMP_MAP_ALWAYS_TO_P (kind))
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+ newn->host_start - oldn->host_start),
(void *) newn->host_start,
@@ -364,8 +391,8 @@ get_kind (bool short_mapkind, void *kinds, int idx
}
static void
-gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
- uintptr_t target_offset, uintptr_t bias,
+gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
+ uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias,
struct gomp_coalesce_buf *cbuf)
{
struct gomp_device_descr *devicep = tgt->device_descr;
@@ -376,7 +403,7 @@ static void
if (cur_node.host_start == (uintptr_t) NULL)
{
cur_node.tgt_offset = (uintptr_t) NULL;
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset,
sizeof (void *), cbuf);
@@ -398,12 +425,13 @@ static void
array section. Now subtract bias to get what we want
to initialize the pointer with. */
cur_node.tgt_offset -= bias;
- gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
+ gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
}
static void
-gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
+gomp_map_fields_existing (struct target_mem_desc *tgt,
+ struct goacc_asyncqueue *aq, splay_tree_key n,
size_t first, size_t i, void **hostaddrs,
size_t *sizes, void *kinds,
struct gomp_coalesce_buf *cbuf)
@@ -423,7 +451,7 @@ static void
&& n2->tgt == n->tgt
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
- gomp_map_vars_existing (devicep, n2, &cur_node,
+ gomp_map_vars_existing (devicep, aq, n2, &cur_node,
&tgt->list[i], kind & typemask, cbuf);
return;
}
@@ -439,8 +467,8 @@ static void
&& n2->host_start - n->host_start
== n2->tgt_offset - n->tgt_offset)
{
- gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
- kind & typemask, cbuf);
+ gomp_map_vars_existing (devicep, aq, n2, &cur_node,
+ &tgt->list[i], kind & typemask, cbuf);
return;
}
}
@@ -451,7 +479,7 @@ static void
&& n2->tgt == n->tgt
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
- gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
+ gomp_map_vars_existing (devicep, aq, n2, &cur_node, &tgt->list[i],
kind & typemask, cbuf);
return;
}
@@ -483,10 +511,12 @@ gomp_map_val (struct target_mem_desc *tgt, void **
return tgt->tgt_start + tgt->list[i].offset;
}
-attribute_hidden struct target_mem_desc *
-gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
- void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
- bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+static inline __attribute__((always_inline)) struct target_mem_desc *
+gomp_map_vars_internal (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq, size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes,
+ void *kinds, bool short_mapkind,
+ enum gomp_map_vars_kind pragma_kind)
{
size_t i, tgt_align, tgt_size, not_found_cnt = 0;
bool has_firstprivate = false;
@@ -600,7 +630,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
continue;
}
for (i = first; i <= last; i++)
- gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+ gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
sizes, kinds, NULL);
i--;
continue;
@@ -645,7 +675,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
else
n = splay_tree_lookup (mem_map, &cur_node);
if (n && n->refcount != REFCOUNT_LINK)
- gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
+ gomp_map_vars_existing (devicep, aq, n, &cur_node, &tgt->list[i],
kind & typemask, NULL);
else
{
@@ -756,7 +786,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
tgt_size = (tgt_size + align - 1) & ~(align - 1);
tgt->list[i].offset = tgt_size;
len = sizes[i];
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + tgt_size),
(void *) hostaddrs[i], len, cbufp);
tgt_size += len;
@@ -790,7 +820,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
continue;
}
for (i = first; i <= last; i++)
- gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+ gomp_map_fields_existing (tgt, aq, n, first, i, hostaddrs,
sizes, kinds, cbufp);
i--;
continue;
@@ -810,7 +840,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
if (cur_node.tgt_offset)
cur_node.tgt_offset -= sizes[i];
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (n->tgt->tgt_start
+ n->tgt_offset
+ cur_node.host_start
@@ -831,7 +861,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
k->host_end = k->host_start + sizeof (void *);
splay_tree_key n = splay_tree_lookup (mem_map, k);
if (n && n->refcount != REFCOUNT_LINK)
- gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
+ gomp_map_vars_existing (devicep, aq, n, k, &tgt->list[i],
kind & typemask, cbufp);
else
{
@@ -884,18 +914,19 @@ gomp_map_val (struct target_mem_desc *tgt, void **
case GOMP_MAP_FORCE_TOFROM:
case GOMP_MAP_ALWAYS_TO:
case GOMP_MAP_ALWAYS_TOFROM:
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
k->host_end - k->host_start, cbufp);
break;
case GOMP_MAP_POINTER:
- gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
+ gomp_map_pointer (tgt, aq,
+ (uintptr_t) *(void **) k->host_start,
k->tgt_offset, sizes[i], cbufp);
break;
case GOMP_MAP_TO_PSET:
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
@@ -917,7 +948,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
tgt->list[j].always_copy_from = false;
if (k->refcount != REFCOUNT_INFINITY)
k->refcount++;
- gomp_map_pointer (tgt,
+ gomp_map_pointer (tgt, aq,
(uintptr_t) *(void **) hostaddrs[j],
k->tgt_offset
+ ((uintptr_t) hostaddrs[j]
@@ -946,7 +977,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
break;
case GOMP_MAP_FORCE_DEVICEPTR:
assert (k->host_end - k->host_start == sizeof (void *));
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
@@ -965,7 +996,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
/* We intentionally do not use coalescing here, as it's not
data allocated by the current call to this function. */
- gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
+ gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
&tgt_addr, sizeof (void *), NULL);
}
array++;
@@ -978,7 +1009,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
for (i = 0; i < mapnum; i++)
{
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + i * sizeof (void *)),
(void *) &cur_node.tgt_offset, sizeof (void *),
cbufp);
@@ -989,7 +1020,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **
{
long c = 0;
for (c = 0; c < cbuf.chunk_cnt; ++c)
- gomp_copy_host2dev (devicep,
+ gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + cbuf.chunks[c].start),
(char *) cbuf.buf + (cbuf.chunks[c].start
- cbuf.chunks[0].start),
@@ -1012,7 +1043,27 @@ gomp_map_val (struct target_mem_desc *tgt, void **
return tgt;
}
-static void
+attribute_hidden struct target_mem_desc *
+gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
+ bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
+{
+ return gomp_map_vars_internal (devicep, NULL, mapnum, hostaddrs, devaddrs,
+ sizes, kinds, short_mapkind, pragma_kind);
+}
+
+attribute_hidden struct target_mem_desc *
+gomp_map_vars_async (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq, size_t mapnum,
+ void **hostaddrs, void **devaddrs, size_t *sizes,
+ void *kinds, bool short_mapkind,
+ enum gomp_map_vars_kind pragma_kind)
+{
+ return gomp_map_vars_internal (devicep, aq, mapnum, hostaddrs, devaddrs,
+ sizes, kinds, short_mapkind, pragma_kind);
+}
+
+attribute_hidden void
gomp_unmap_tgt (struct target_mem_desc *tgt)
{
/* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region. */
@@ -1040,12 +1091,24 @@ gomp_remove_var (struct gomp_device_descr *devicep
return is_tgt_unmapped;
}
+static void
+gomp_unref_tgt (void *ptr)
+{
+ struct target_mem_desc *tgt = (struct target_mem_desc *) ptr;
+
+ if (tgt->refcount > 1)
+ tgt->refcount--;
+ else
+ gomp_unmap_tgt (tgt);
+}
+
/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
variables back from device to host: if it is false, it is assumed that this
has been done already. */
-attribute_hidden void
-gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
+static inline __attribute__((always_inline)) void
+gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
+ struct goacc_asyncqueue *aq)
{
struct gomp_device_descr *devicep = tgt->device_descr;
@@ -1082,7 +1145,7 @@ gomp_remove_var (struct gomp_device_descr *devicep
if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
|| tgt->list[i].always_copy_from)
- gomp_copy_dev2host (devicep,
+ gomp_copy_dev2host (devicep, aq,
(void *) (k->host_start + tgt->list[i].offset),
(void *) (k->tgt->tgt_start + k->tgt_offset
+ tgt->list[i].offset),
@@ -1091,14 +1154,28 @@ gomp_remove_var (struct gomp_device_descr *devicep
gomp_remove_var (devicep, k);
}
- if (tgt->refcount > 1)
- tgt->refcount--;
+ if (aq)
+ devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt,
+ (void *) tgt);
else
- gomp_unmap_tgt (tgt);
+ gomp_unref_tgt ((void *) tgt);
gomp_mutex_unlock (&devicep->lock);
}
+attribute_hidden void
+gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
+{
+ gomp_unmap_vars_internal (tgt, do_copyfrom, NULL);
+}
+
+attribute_hidden void
+gomp_unmap_vars_async (struct target_mem_desc *tgt, bool do_copyfrom,
+ struct goacc_asyncqueue *aq)
+{
+ gomp_unmap_vars_internal (tgt, do_copyfrom, aq);
+}
+
static void
gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
size_t *sizes, void *kinds, bool short_mapkind)
@@ -1148,9 +1225,10 @@ gomp_update (struct gomp_device_descr *devicep, si
size_t size = cur_node.host_end - cur_node.host_start;
if (GOMP_MAP_COPY_TO_P (kind & typemask))
- gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
+ gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
+ NULL);
if (GOMP_MAP_COPY_FROM_P (kind & typemask))
- gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
+ gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
}
}
gomp_mutex_unlock (&devicep->lock);
@@ -1443,9 +1521,24 @@ gomp_init_device (struct gomp_device_descr *device
false);
}
+ /* Initialize OpenACC asynchronous queues. */
+ goacc_init_asyncqueues (devicep);
+
devicep->state = GOMP_DEVICE_INITIALIZED;
}
+/* This function finalizes the target device, specified by DEVICEP. DEVICEP
+ must be locked on entry, and remains locked on return. */
+
+attribute_hidden bool
+gomp_fini_device (struct gomp_device_descr *devicep)
+{
+ bool ret = goacc_fini_asyncqueues (devicep);
+ ret &= devicep->fini_device_func (devicep->target_id);
+ devicep->state = GOMP_DEVICE_FINALIZED;
+ return ret;
+}
+
attribute_hidden void
gomp_unload_device (struct gomp_device_descr *devicep)
{
@@ -1954,7 +2047,7 @@ gomp_exit_data (struct gomp_device_descr *devicep,
if ((kind == GOMP_MAP_FROM && k->refcount == 0)
|| kind == GOMP_MAP_ALWAYS_FROM)
- gomp_copy_dev2host (devicep, (void *) cur_node.host_start,
+ gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
(void *) (k->tgt->tgt_start + k->tgt_offset
+ cur_node.host_start
- k->host_start),
@@ -2636,20 +2729,20 @@ gomp_load_plugin_for_device (struct gomp_device_de
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
{
if (!DLSYM_OPT (openacc.exec, openacc_exec)
- || !DLSYM_OPT (openacc.register_async_cleanup,
- openacc_register_async_cleanup)
- || !DLSYM_OPT (openacc.async_test, openacc_async_test)
- || !DLSYM_OPT (openacc.async_test_all, openacc_async_test_all)
- || !DLSYM_OPT (openacc.async_wait, openacc_async_wait)
- || !DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async)
- || !DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all)
- || !DLSYM_OPT (openacc.async_wait_all_async,
- openacc_async_wait_all_async)
- || !DLSYM_OPT (openacc.async_set_async, openacc_async_set_async)
|| !DLSYM_OPT (openacc.create_thread_data,
openacc_create_thread_data)
|| !DLSYM_OPT (openacc.destroy_thread_data,
- openacc_destroy_thread_data))
+ openacc_destroy_thread_data)
+ || !DLSYM_OPT (openacc.async.construct, openacc_async_construct)
+ || !DLSYM_OPT (openacc.async.destruct, openacc_async_destruct)
+ || !DLSYM_OPT (openacc.async.test, openacc_async_test)
+ || !DLSYM_OPT (openacc.async.synchronize, openacc_async_synchronize)
+ || !DLSYM_OPT (openacc.async.serialize, openacc_async_serialize)
+ || !DLSYM_OPT (openacc.async.queue_callback,
+ openacc_async_queue_callback)
+ || !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
+ || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
+ || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev))
{
/* Require all the OpenACC handlers if we have
GOMP_OFFLOAD_CAP_OPENACC_200. */
@@ -2700,10 +2793,7 @@ gomp_target_fini (void)
struct gomp_device_descr *devicep = &devices[i];
gomp_mutex_lock (&devicep->lock);
if (devicep->state == GOMP_DEVICE_INITIALIZED)
- {
- ret = devicep->fini_device_func (devicep->target_id);
- devicep->state = GOMP_DEVICE_FINALIZED;
- }
+ ret = gomp_fini_device (devicep);
gomp_mutex_unlock (&devicep->lock);
if (!ret)
gomp_fatal ("device finalization failed");
Index: libgomp/libgomp.h
===================================================================
--- libgomp/libgomp.h (revision 269183)
+++ libgomp/libgomp.h (working copy)
@@ -949,25 +949,32 @@ typedef struct acc_dispatch_t
/* Execute. */
__typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
- /* Async cleanup callback registration. */
- __typeof (GOMP_OFFLOAD_openacc_register_async_cleanup)
- *register_async_cleanup_func;
-
- /* Asynchronous routines. */
- __typeof (GOMP_OFFLOAD_openacc_async_test) *async_test_func;
- __typeof (GOMP_OFFLOAD_openacc_async_test_all) *async_test_all_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait) *async_wait_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait_async) *async_wait_async_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait_all) *async_wait_all_func;
- __typeof (GOMP_OFFLOAD_openacc_async_wait_all_async)
- *async_wait_all_async_func;
- __typeof (GOMP_OFFLOAD_openacc_async_set_async) *async_set_async_func;
-
/* Create/destroy TLS data. */
__typeof (GOMP_OFFLOAD_openacc_create_thread_data) *create_thread_data_func;
__typeof (GOMP_OFFLOAD_openacc_destroy_thread_data)
*destroy_thread_data_func;
+
+ struct {
+ /* TODO: Once created and put into the "active" list, asyncqueues are never
+ destructed or removed from the "active" list, except when the device is
+ shut down. */
+ gomp_mutex_t lock;
+ int nasyncqueue;
+ struct goacc_asyncqueue **asyncqueue;
+ struct goacc_asyncqueue_list *active;
+ __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_test) *test_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_synchronize) *synchronize_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_serialize) *serialize_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_queue_callback) *queue_callback_func;
+
+ __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
+ } async;
+
/* NVIDIA target specific routines. */
struct {
__typeof (GOMP_OFFLOAD_openacc_cuda_get_current_device)
@@ -1053,17 +1060,33 @@ enum gomp_map_vars_kind
GOMP_MAP_VARS_ENTER_DATA
};
-extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
+extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *, int);
extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
unsigned short *);
+struct gomp_coalesce_buf;
+extern void gomp_copy_host2dev (struct gomp_device_descr *,
+ struct goacc_asyncqueue *, void *, const void *,
+ size_t, struct gomp_coalesce_buf *);
+extern void gomp_copy_dev2host (struct gomp_device_descr *,
+ struct goacc_asyncqueue *, void *, const void *,
+ size_t);
extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
size_t, void **, void **,
size_t *, void *, bool,
enum gomp_map_vars_kind);
+extern struct target_mem_desc *gomp_map_vars_async (struct gomp_device_descr *,
+ struct goacc_asyncqueue *,
+ size_t, void **, void **,
+ size_t *, void *, bool,
+ enum gomp_map_vars_kind);
+extern void gomp_unmap_tgt (struct target_mem_desc *);
extern void gomp_unmap_vars (struct target_mem_desc *, bool);
+extern void gomp_unmap_vars_async (struct target_mem_desc *, bool,
+ struct goacc_asyncqueue *);
extern void gomp_init_device (struct gomp_device_descr *);
+extern bool gomp_fini_device (struct gomp_device_descr *);
extern void gomp_free_memmap (struct splay_tree_s *);
extern void gomp_unload_device (struct gomp_device_descr *);
extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
Index: libgomp/oacc-int.h
===================================================================
--- libgomp/oacc-int.h (revision 269183)
+++ libgomp/oacc-int.h (working copy)
@@ -99,6 +99,13 @@ void goacc_restore_bind (void);
void goacc_lazy_initialize (void);
void goacc_host_init (void);
+void goacc_init_asyncqueues (struct gomp_device_descr *);
+bool goacc_fini_asyncqueues (struct gomp_device_descr *);
+void goacc_async_free (struct gomp_device_descr *, struct goacc_asyncqueue *,
+ void *);
+struct goacc_asyncqueue *get_goacc_asyncqueue (int);
+struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool,
+ int);
static inline bool
async_valid_stream_id_p (int async)
{
Index: libgomp/oacc-host.c
===================================================================
--- libgomp/oacc-host.c (revision 269183)
+++ libgomp/oacc-host.c (working copy)
@@ -140,57 +140,91 @@ host_openacc_exec (void (*fn) (void *),
size_t mapnum __attribute__ ((unused)),
void **hostaddrs,
void **devaddrs __attribute__ ((unused)),
- int async __attribute__ ((unused)),
- unsigned *dims __attribute ((unused)),
+ unsigned *dims __attribute__ ((unused)),
void *targ_mem_desc __attribute__ ((unused)))
{
fn (hostaddrs);
}
static void
-host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
- int async __attribute__ ((unused)))
+host_openacc_async_exec (void (*fn) (void *),
+ size_t mapnum __attribute__ ((unused)),
+ void **hostaddrs,
+ void **devaddrs __attribute__ ((unused)),
+ unsigned *dims __attribute__ ((unused)),
+ void *targ_mem_desc __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq __attribute__ ((unused)))
{
+ fn (hostaddrs);
}
static int
-host_openacc_async_test (int async __attribute__ ((unused)))
+host_openacc_async_test (struct goacc_asyncqueue *aq __attribute__ ((unused)))
{
return 1;
}
-static int
-host_openacc_async_test_all (void)
+static bool
+host_openacc_async_synchronize (struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
{
- return 1;
+ return true;
}
-static void
-host_openacc_async_wait (int async __attribute__ ((unused)))
+static bool
+host_openacc_async_serialize (struct goacc_asyncqueue *aq1
+ __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq2
+ __attribute__ ((unused)))
{
+ return true;
}
-static void
-host_openacc_async_wait_async (int async1 __attribute__ ((unused)),
- int async2 __attribute__ ((unused)))
+static bool
+host_openacc_async_host2dev (int ord __attribute__ ((unused)),
+ void *dst __attribute__ ((unused)),
+ const void *src __attribute__ ((unused)),
+ size_t n __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
{
+ return true;
}
-static void
-host_openacc_async_wait_all (void)
+static bool
+host_openacc_async_dev2host (int ord __attribute__ ((unused)),
+ void *dst __attribute__ ((unused)),
+ const void *src __attribute__ ((unused)),
+ size_t n __attribute__ ((unused)),
+ struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
{
+ return true;
}
static void
-host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
+host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
+ __attribute__ ((unused)),
+ void (*callback_fn)(void *)
+ __attribute__ ((unused)),
+ void *userptr __attribute__ ((unused)))
{
}
-static void
-host_openacc_async_set_async (int async __attribute__ ((unused)))
+static struct goacc_asyncqueue *
+host_openacc_async_construct (void)
{
+ /* Non-NULL 0xffff... value as opaque dummy. */
+ return (struct goacc_asyncqueue *) -1;
}
+static bool
+host_openacc_async_destruct (struct goacc_asyncqueue *aq
+ __attribute__ ((unused)))
+{
+ return true;
+}
+
static void *
host_openacc_create_thread_data (int ord __attribute__ ((unused)))
{
@@ -235,19 +269,21 @@ static struct gomp_device_descr host_dispatch =
.exec_func = host_openacc_exec,
- .register_async_cleanup_func = host_openacc_register_async_cleanup,
-
- .async_test_func = host_openacc_async_test,
- .async_test_all_func = host_openacc_async_test_all,
- .async_wait_func = host_openacc_async_wait,
- .async_wait_async_func = host_openacc_async_wait_async,
- .async_wait_all_func = host_openacc_async_wait_all,
- .async_wait_all_async_func = host_openacc_async_wait_all_async,
- .async_set_async_func = host_openacc_async_set_async,
-
.create_thread_data_func = host_openacc_create_thread_data,
.destroy_thread_data_func = host_openacc_destroy_thread_data,
+ .async = {
+ .construct_func = host_openacc_async_construct,
+ .destruct_func = host_openacc_async_destruct,
+ .test_func = host_openacc_async_test,
+ .synchronize_func = host_openacc_async_synchronize,
+ .serialize_func = host_openacc_async_serialize,
+ .queue_callback_func = host_openacc_async_queue_callback,
+ .exec_func = host_openacc_async_exec,
+ .dev2host_func = host_openacc_async_dev2host,
+ .host2dev_func = host_openacc_async_host2dev,
+ },
+
.cuda = {
.get_current_device_func = NULL,
.get_current_context_func = NULL,
Index: libgomp/libgomp-plugin.h
===================================================================
--- libgomp/libgomp-plugin.h (revision 269183)
+++ libgomp/libgomp-plugin.h (working copy)
@@ -53,6 +53,20 @@ enum offload_target_type
OFFLOAD_TARGET_TYPE_HSA = 7
};
+/* Opaque type to represent plugin-dependent implementation of an
+ OpenACC asynchronous queue. */
+struct goacc_asyncqueue;
+
+/* Used to keep a list of active asynchronous queues. */
+struct goacc_asyncqueue_list
+{
+ struct goacc_asyncqueue *aq;
+ struct goacc_asyncqueue_list *next;
+};
+
+typedef struct goacc_asyncqueue *goacc_aq;
+typedef struct goacc_asyncqueue_list *goacc_aq_list;
+
/* Auxiliary struct, used for transferring pairs of addresses from plugin
to libgomp. */
struct addr_pair
@@ -93,22 +107,31 @@ extern bool GOMP_OFFLOAD_dev2dev (int, void *, con
extern bool GOMP_OFFLOAD_can_run (void *);
extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
+
extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
- void **, int, unsigned *, void *);
-extern void GOMP_OFFLOAD_openacc_register_async_cleanup (void *, int);
-extern int GOMP_OFFLOAD_openacc_async_test (int);
-extern int GOMP_OFFLOAD_openacc_async_test_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait (int);
-extern void GOMP_OFFLOAD_openacc_async_wait_async (int, int);
-extern void GOMP_OFFLOAD_openacc_async_wait_all (void);
-extern void GOMP_OFFLOAD_openacc_async_wait_all_async (int);
-extern void GOMP_OFFLOAD_openacc_async_set_async (int);
+ void **, unsigned *, void *);
extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
+extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
+extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *,
+ struct goacc_asyncqueue *);
+extern void GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *,
+ void (*)(void *), void *);
+extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **,
+ void **, unsigned *, void *,
+ struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
+ struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
+ struct goacc_asyncqueue *);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
-extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (int);
-extern int GOMP_OFFLOAD_openacc_cuda_set_stream (int, void *);
+extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *,
+ void *);
#ifdef __cplusplus
}
More information about the Gcc-patches
mailing list