This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] [og9] Fix OpenACC "ephemeral" asynchronous host-to-device copies
- From: Julian Brown <julian at codesourcery dot com>
- To: <gcc-patches at gcc dot gnu dot org>
- Cc: Andrew Stubbs <ams at codesourcery dot com>, Thomas Schwinge <thomas_schwinge at mentor dot com>
- Date: Tue, 17 Sep 2019 10:21:56 -0700
- Subject: [PATCH] [og9] Fix OpenACC "ephemeral" asynchronous host-to-device copies
- Ironport-sdr: XVZMywbxntDF1M9TKfnfFZAU9rJ5Yk9NucDmuZ89FTEXV2Ev35ANzAzl8lWWE14JuqE458dC9X e+cVT88sSWDwTePfRoxTRoj5L63E8RrV2TUWD2v3gW5asmxRaYyrz2mFllF1qPUr4HShj9axZY QZSrQAQUuWKOmMAjsmBwqwiVQdgmbQA91ye/SNhyQEcvSF3tB9CPsDIw/JynviNxTCPTa2zAjt je3bfJKlZeRtzX7FeMC0FeMsru8sT7XTaM9fuehDMZz3fcglvDuo+WId6+XtK/bpXDquWPyK+2 b6M=
- Ironport-sdr: 9Kc6nTb2DtuxYqMoNY5V39FkKNcXjnPdX7pLH7qUuEM3b6I4nm+ns1XI7wjkIgg8DLgcodSNOQ 7xaUXRwrsJqJJgrke97z2f50YkFUOQOQRjfpJl5rr2P5xSoAO16URa4tSEjG2YJYA3clc0tzE7 Cog5fr6NirDm1VRKJ/I1WB+1QYDPhEsbGA1MDRCVuoKZAD1O7YLvHf4J3TjVplXhfkjNfOsYgr rw5ZYsFRIDlD/Z8eU0oCUYPrFX1h8R9llnmBqHe8tEIY4Ya0nkR2ZtYWnKGrcBOu6w+sv/47hI MXo=
This patch fixes an issue with back-to-back asynchronous compute regions
working on the same data with intervening copyout/copyins. For such
regions, there was a likelihood that asynchronous "copyin" operations
on the second region would take place before the copyout from the first
region had completed.
I'd previously thought that copying asynchronous "host" data for
host-to-device transfers immediately to a temporary buffer was always
safe, but not necessarily optimal. But that turns out to not be true
if the source data in question originates from the user program, and is
used for the output of earlier asynchronous operations.
Unfortunately I don't think there's a way of fixing this problem without
knowing where the source data for a particular host-to-device copy comes
from -- several places (e.g. in gomp_map_vars_internal) have that data
coming from a host stack location, which may be long gone by the time
the asynchronous host-to-device copy takes place.
So, this patch introduces an "ephemeral" parameter to host-to-device
copying functions -- right down to the async entry point for such
copies in the offload plugin. The parameter must be set accurately: if
it is TRUE for copies from user data as above, then stale data may be
copied to the device. If it is FALSE for host-stack originated copies,
or for heap locations that might disappear before the copy takes place,
the host-to-device copy will transfer garbage.
The patch also disables coalescing buffers for asynchronous copies in
target.c, because those too may cause stale data to be copied to the target.
Tested with offloading to AMD GCN. I will apply to the
openacc-gcc-9-branch shortly.
Julian
ChangeLog
libgomp/
* libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_host2dev): Update
prototype.
* libgomp.h (gomp_copy_host2dev): Update prototype.
* oacc-host.c (host_openacc_async_host2dev): Add ephemeral parameter.
* oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev.
(update_dev_host): Likewise.
* oacc-parallel.c (GOACC_enter_exit_data): Call async versions of
acc_attach/acc_detach/acc_detach_finalize functions.
* plugin/plugin-gcn.c (wait_for_queue_nonfull): Don't lock/unlock
aq->mutex here.
(queue_push_launch): Lock aq->mutex before calling
wait_for_queue_nonfull.
(queue_push_callback): Likewise.
(queue_push_asyncwait): Likewise.
(queue_push_placeholder): Likewise.
(GOMP_OFFLOAD_openacc_async_host2dev): Add ephemeral parameter. Copy
source data to temporary space immediately if true, and pass to
queue_push_copy.
(goacc_device_copy_async): Remove.
(gomp_copy_host2dev): Add ephemeral parameter. Update function comment.
Call async host2dev plugin hook directly.
(gomp_copy_dev2host): Call async dev2host plugin hook directly.
(gomp_map_vars_existing, gomp_map_pointer, gomp_attach_pointer,
gomp_detach_pointer): Update calls to gomp_copy_host2dev.
(gomp_map_vars_internal): Don't use coalescing buffer for asynchronous
copies. Update calls to gomp_copy_host2dev.
(gomp_update): Update calls to gomp_copy_host2dev.
* testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c (main): Fix
async-safety issue. Increase number of iterations.
* testsuite/libgomp.oacc-fortran/lib-16-2.f90: Fix async-safety issue.
---
libgomp/ChangeLog.openacc | 33 ++++++
libgomp/libgomp-plugin.h | 3 +-
libgomp/libgomp.h | 2 +-
libgomp/oacc-host.c | 1 +
libgomp/oacc-mem.c | 4 +-
libgomp/oacc-parallel.c | 10 +-
libgomp/plugin/plugin-gcn.c | 43 ++++----
libgomp/target.c | 101 +++++++++---------
.../libgomp.oacc-c-c++-common/deep-copy-10.c | 20 ++--
.../libgomp.oacc-fortran/lib-16-2.f90 | 5 +
10 files changed, 135 insertions(+), 87 deletions(-)
diff --git a/libgomp/ChangeLog.openacc b/libgomp/ChangeLog.openacc
index 5f39fae6f51..1006b8149c8 100644
--- a/libgomp/ChangeLog.openacc
+++ b/libgomp/ChangeLog.openacc
@@ -1,3 +1,36 @@
+2019-09-17 Julian Brown <julian@codesourcery.com>
+
+ * libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_host2dev): Update
+ prototype.
+ * libgomp.h (gomp_copy_host2dev): Update prototype.
+ * oacc-host.c (host_openacc_async_host2dev): Add ephemeral parameter.
+ * oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev.
+ (update_dev_host): Likewise.
+ * oacc-parallel.c (GOACC_enter_exit_data): Call async versions of
+ acc_attach/acc_detach/acc_detach_finalize functions.
+ * plugin/plugin-gcn.c (wait_for_queue_nonfull): Don't lock/unlock
+ aq->mutex here.
+ (queue_push_launch): Lock aq->mutex before calling
+ wait_for_queue_nonfull.
+ (queue_push_callback): Likewise.
+ (queue_push_asyncwait): Likewise.
+ (queue_push_placeholder): Likewise.
+ (GOMP_OFFLOAD_openacc_async_host2dev): Add ephemeral parameter. Copy
+ source data to temporary space immediately if true, and pass to
+ queue_push_copy.
+ (goacc_device_copy_async): Remove.
+ (gomp_copy_host2dev): Add ephemeral parameter. Update function comment.
+ Call async host2dev plugin hook directly.
+ (gomp_copy_dev2host): Call async dev2host plugin hook directly.
+ (gomp_map_vars_existing, gomp_map_pointer, gomp_attach_pointer,
+ gomp_detach_pointer): Update calls to gomp_copy_host2dev.
+ (gomp_map_vars_internal): Don't use coalescing buffer for asynchronous
+ copies. Update calls to gomp_copy_host2dev.
+ (gomp_update): Update calls to gomp_copy_host2dev.
+ * testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c (main): Fix
+ async-safety issue. Increase number of iterations.
+ * testsuite/libgomp.oacc-fortran/lib-16-2.f90: Fix async-safety issue.
+
2019-09-17 Julian Brown <julian@codesourcery.com>
* oacc-host.c (host_openacc_async_queue_callback): Invoke callback
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index bd63c422b0c..fcd47279332 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -141,7 +141,8 @@ extern void GOMP_OFFLOAD_openacc_async_exec_params (void (*) (void *), size_t,
struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t,
struct goacc_asyncqueue *);
-extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
+extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *,
+ size_t, bool,
struct goacc_asyncqueue *);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 803f72db922..ab216a31206 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1120,7 +1120,7 @@ extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
struct gomp_coalesce_buf;
extern void gomp_copy_host2dev (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
- size_t, struct gomp_coalesce_buf *);
+ size_t, bool, struct gomp_coalesce_buf *);
extern void gomp_copy_dev2host (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
size_t);
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index 0231b597114..4bc2eeb3c53 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -230,6 +230,7 @@ host_openacc_async_host2dev (int ord __attribute__ ((unused)),
void *dst __attribute__ ((unused)),
const void *src __attribute__ ((unused)),
size_t n __attribute__ ((unused)),
+ bool eph __attribute__ ((unused)),
struct goacc_asyncqueue *aq
__attribute__ ((unused)))
{
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index c07a5eb42a7..f8c71bf04c5 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -203,7 +203,7 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s);
else
- gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+ gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
if (profiling_p)
{
@@ -819,7 +819,7 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
goacc_aq aq = get_goacc_asyncqueue (async);
if (is_dev)
- gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+ gomp_copy_host2dev (acc_dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
else
gomp_copy_dev2host (acc_dev, aq, h, d, s);
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 0c9cb3c461c..a3ec0ed2adf 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -1022,7 +1022,7 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
if (!pointer)
{
if (kind == GOMP_MAP_ATTACH)
- acc_attach (hostaddrs[i]);
+ acc_attach_async (hostaddrs[i], async);
else if (kind == GOMP_MAP_STRUCT)
i += sizes[i];
}
@@ -1042,9 +1042,9 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
if (!pointer)
{
if (kind == GOMP_MAP_DETACH)
- acc_detach (hostaddrs[i]);
+ acc_detach_async (hostaddrs[i], async);
else if (kind == GOMP_MAP_FORCE_DETACH)
- acc_detach_finalize (hostaddrs[i]);
+ acc_detach_finalize_async (hostaddrs[i], async);
else if (kind == GOMP_MAP_STRUCT)
i += sizes[i];
}
@@ -1053,9 +1053,9 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
unsigned char kind2 = kinds[i + pointer - 1] & 0xff;
if (kind2 == GOMP_MAP_DETACH)
- acc_detach (hostaddrs[i + pointer - 1]);
+ acc_detach_async (hostaddrs[i + pointer - 1], async);
else if (kind2 == GOMP_MAP_FORCE_DETACH)
- acc_detach_finalize (hostaddrs[i + pointer - 1]);
+ acc_detach_finalize_async (hostaddrs[i + pointer - 1], async);
i += pointer - 1;
}
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index b8ec96391f7..b5995af0a06 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1408,13 +1408,9 @@ wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
{
if (aq->queue_n == ASYNC_QUEUE_SIZE)
{
- pthread_mutex_lock (&aq->mutex);
-
/* Queue is full. Wait for it to not be full. */
while (aq->queue_n == ASYNC_QUEUE_SIZE)
pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
-
- pthread_mutex_unlock (&aq->mutex);
}
}
@@ -1424,10 +1420,10 @@ queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
{
assert (aq->agent == kernel->agent);
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n)
% ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
@@ -1453,10 +1449,10 @@ static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
void *data)
{
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n)
% ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
@@ -1484,10 +1480,10 @@ static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
struct placeholder *placeholderp)
{
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
HSA_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
@@ -1511,10 +1507,10 @@ queue_push_placeholder (struct goacc_asyncqueue *aq)
{
struct placeholder *placeholderp;
- wait_for_queue_nonfull (aq);
-
pthread_mutex_lock (&aq->mutex);
+ wait_for_queue_nonfull (aq);
+
int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
if (DEBUG_QUEUES)
HSA_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
@@ -3683,19 +3679,22 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
- size_t n, struct goacc_asyncqueue *aq)
+ size_t n, bool ephemeral,
+ struct goacc_asyncqueue *aq)
{
struct agent_info *agent = get_agent_info (device);
assert (agent == aq->agent);
- /* The source data does not necessarily remain live until the deferred
- copy happens. Taking a snapshot of the data here avoids reading
- uninitialised data later, but means that (a) data is copied twice and
- (b) modifications to the copied data between the "spawning" point of
- the asynchronous kernel and when it is executed will not be seen.
- But, that is probably correct. */
- void *src_copy = GOMP_PLUGIN_malloc (n);
- memcpy (src_copy, src, n);
- queue_push_copy (aq, dst, src_copy, n, true);
+
+ if (ephemeral)
+ {
+ /* The source data is on the stack or otherwise may be deallocated
+ before the asynchronous copy takes place. Take a copy of the source
+ data. */
+ void *src_copy = GOMP_PLUGIN_malloc (n);
+ memcpy (src_copy, src, n);
+ src = src_copy;
+ }
+ queue_push_copy (aq, dst, src, n, ephemeral);
return true;
}
diff --git a/libgomp/target.c b/libgomp/target.c
index 0656df19613..683a42b1164 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -194,22 +194,6 @@ gomp_device_copy (struct gomp_device_descr *devicep,
}
}
-static inline void
-goacc_device_copy_async (struct gomp_device_descr *devicep,
- bool (*copy_func) (int, void *, const void *, size_t,
- struct goacc_asyncqueue *),
- const char *dst, void *dstaddr,
- const char *src, const void *srcaddr,
- size_t size, struct goacc_asyncqueue *aq)
-{
- if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq))
- {
- gomp_mutex_unlock (&devicep->lock);
- gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed",
- src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size);
- }
-}
-
/* Infrastructure for coalescing adjacent or nearly adjacent (in device
addresses) host to device memory transfers. */
@@ -303,15 +287,17 @@ gomp_to_device_kind_p (int kind)
}
/* Copy host memory to an offload device. In asynchronous mode (if AQ is
- non-NULL), H may point to a stack location. It is up to the underlying
- plugin to ensure that this data is read immediately, rather than at some
- later point when the stack frame will likely have been destroyed. */
+ non-NULL), when the source data is stack or may otherwise be deallocated
+ before the asynchronous copy takes place, EPHEMERAL must be passed as
+ TRUE. The CBUF isn't used for non-ephemeral asynchronous copies, because
+ the host data might not be computed yet (by an earlier asynchronous compute
+ region). */
attribute_hidden void
gomp_copy_host2dev (struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq,
void *d, const void *h, size_t sz,
- struct gomp_coalesce_buf *cbuf)
+ bool ephemeral, struct gomp_coalesce_buf *cbuf)
{
if (cbuf)
{
@@ -339,8 +325,15 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
}
}
if (__builtin_expect (aq != NULL, 0))
- goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
- "dev", d, "host", h, sz, aq);
+ {
+ if (!devicep->openacc.async.host2dev_func (devicep->target_id, d, h, sz,
+ ephemeral, aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Copying of host object [%p..%p) to dev object [%p..%p) "
+ "failed", h, h + sz, d, d + sz);
+ }
+ }
else
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
}
@@ -351,8 +344,15 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep,
void *h, const void *d, size_t sz)
{
if (__builtin_expect (aq != NULL, 0))
- goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
- "host", h, "dev", d, sz, aq);
+ {
+ if (!devicep->openacc.async.dev2host_func (devicep->target_id, h, d, sz,
+ aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("Copying of dev object [%p..%p) to host object [%p..%p) "
+ "failed", d, d + sz, h, h + sz);
+ }
+ }
else
gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
}
@@ -579,7 +579,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep,
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+ newn->host_start - oldn->host_start),
(void *) newn->host_start,
- newn->host_end - newn->host_start, cbuf);
+ newn->host_end - newn->host_start, false, cbuf);
if (oldn->refcount != REFCOUNT_INFINITY)
oldn->refcount++;
@@ -607,8 +607,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
cur_node.tgt_offset = (uintptr_t) NULL;
gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + target_offset),
- (void *) &cur_node.tgt_offset, sizeof (void *),
- cbuf);
+ (void *) &cur_node.tgt_offset,
+ sizeof (void *), true, cbuf);
return;
}
/* Add bias to the pointer value. */
@@ -628,7 +628,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
to initialize the pointer with. */
cur_node.tgt_offset -= bias;
gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
- (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
+ (void *) &cur_node.tgt_offset, sizeof (void *), true,
+ cbuf);
}
static void
@@ -760,7 +761,7 @@ gomp_attach_pointer (struct gomp_device_descr *devicep,
(void *) (n->tgt->tgt_start + n->tgt_offset), (void *) data);
gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &data,
- sizeof (void *), cbufp);
+ sizeof (void *), true, cbufp);
}
else
gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
@@ -815,7 +816,7 @@ gomp_detach_pointer (struct gomp_device_descr *devicep,
(void *) target);
gomp_copy_host2dev (devicep, aq, (void *) devptr, (void *) &target,
- sizeof (void *), cbufp);
+ sizeof (void *), true, cbufp);
}
else
gomp_debug (1, "%s: attach count for %p -> %u\n", __FUNCTION__,
@@ -1147,8 +1148,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
for (i = first; i <= last; i++)
{
tgt->list[i].key = NULL;
- if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
- & typemask))
+ if (!aq
+ && gomp_to_device_kind_p (get_kind (short_mapkind, kinds,
+ i) & typemask))
gomp_coalesce_buf_add (&cbuf,
tgt_size - cur_node.host_end
+ (uintptr_t) hostaddrs[i],
@@ -1209,8 +1211,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
- gomp_coalesce_buf_add (&cbuf, tgt_size,
- cur_node.host_end - cur_node.host_start);
+ if (!aq)
+ gomp_coalesce_buf_add (&cbuf, tgt_size,
+ cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
has_firstprivate = true;
continue;
@@ -1240,7 +1243,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
- if (gomp_to_device_kind_p (kind & typemask))
+ if (!aq && gomp_to_device_kind_p (kind & typemask))
gomp_coalesce_buf_add (&cbuf, tgt_size,
cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
@@ -1395,7 +1398,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
len = sizes[i];
gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + tgt_size),
- (void *) hostaddrs[i], len, cbufp);
+ (void *) hostaddrs[i], len, false, cbufp);
tgt_size += len;
continue;
case GOMP_MAP_FIRSTPRIVATE_INT:
@@ -1448,12 +1451,11 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
if (cur_node.tgt_offset)
cur_node.tgt_offset -= sizes[i];
gomp_copy_host2dev (devicep, aq,
- (void *) (n->tgt->tgt_start
- + n->tgt_offset
+ (void *) (n->tgt->tgt_start + n->tgt_offset
+ cur_node.host_start
- n->host_start),
(void *) &cur_node.tgt_offset,
- sizeof (void *), cbufp);
+ sizeof (void *), true, cbufp);
cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
+ cur_node.host_start - n->host_start;
continue;
@@ -1612,7 +1614,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- k->host_end - k->host_start, cbufp);
+ k->host_end - k->host_start, false,
+ cbufp);
break;
case GOMP_MAP_POINTER:
gomp_map_pointer (tgt, aq,
@@ -1624,7 +1627,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- k->host_end - k->host_start, cbufp);
+ k->host_end - k->host_start, false,
+ cbufp);
for (j = i + 1; j < mapnum; j++)
if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
@@ -1676,7 +1680,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- sizeof (void *), cbufp);
+ sizeof (void *), false, cbufp);
break;
default:
gomp_mutex_unlock (&devicep->lock);
@@ -1692,7 +1696,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
/* We intentionally do not use coalescing here, as it's not
data allocated by the current call to this function. */
gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
- &tgt_addr, sizeof (void *), NULL);
+ &tgt_addr, sizeof (void *), true, NULL);
}
array++;
}
@@ -1779,7 +1783,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) tgt->tgt_start
+ k->tgt_offset,
(void *) k->host_start,
- da->data_row_size, cbufp);
+ da->data_row_size, false, cbufp);
array++;
}
target_data_rows[row_start + j] = (void *) target_row_addr;
@@ -1793,7 +1797,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
void *ptrblock = gomp_dynamic_array_create_ptrblock
(da, target_ptrblock, target_data_rows + row_start);
gomp_copy_host2dev (devicep, aq, target_ptrblock, ptrblock,
- da->ptrblock_size, cbufp);
+ da->ptrblock_size, true, cbufp);
free (ptrblock);
}
@@ -1817,7 +1821,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
gomp_copy_host2dev (devicep, aq,
(void *) (tgt->tgt_start + i * sizeof (void *)),
(void *) &cur_node.tgt_offset, sizeof (void *),
- cbufp);
+ true, cbufp);
}
}
@@ -1829,7 +1833,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
(void *) (tgt->tgt_start + cbuf.chunks[c].start),
(char *) cbuf.buf + (cbuf.chunks[c].start
- cbuf.chunks[0].start),
- cbuf.chunks[c].end - cbuf.chunks[c].start, NULL);
+ cbuf.chunks[c].end - cbuf.chunks[c].start, true,
+ NULL);
free (cbuf.buf);
cbuf.buf = NULL;
cbufp = NULL;
@@ -2099,7 +2104,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
if (GOMP_MAP_COPY_TO_P (kind & typemask))
gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size,
- NULL);
+ false, NULL);
if (GOMP_MAP_COPY_FROM_P (kind & typemask))
gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size);
}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
index 37cde4ef059..0bc43e9477f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/deep-copy-10.c
@@ -1,6 +1,10 @@
#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
-/* Test asyncronous attach and detach operation. */
+#define ITERATIONS 1023
+
+/* Test asynchronous attach and detach operation. */
typedef struct {
int *a;
@@ -25,13 +29,13 @@ main (int argc, char* argv[])
#pragma acc enter data copyin(m)
- for (int i = 0; i < 99; i++)
+ for (int i = 0; i < ITERATIONS; i++)
{
int j;
-#pragma acc parallel loop copy(m.a[0:N]) async(i % 2)
+#pragma acc parallel loop copy(m.a[0:N]) async(0)
for (j = 0; j < N; j++)
m.a[j]++;
-#pragma acc parallel loop copy(m.b[0:N]) async((i + 1) % 2)
+#pragma acc parallel loop copy(m.b[0:N]) async(1)
for (j = 0; j < N; j++)
m.b[j]++;
}
@@ -40,10 +44,10 @@ main (int argc, char* argv[])
for (i = 0; i < N; i++)
{
- if (m.a[i] != 99)
- abort ();
- if (m.b[i] != 99)
- abort ();
+ if (m.a[i] != ITERATIONS)
+ abort ();
+ if (m.b[i] != ITERATIONS)
+ abort ();
}
free (m.a);
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
index fa76f65912f..94b80d07f4f 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/lib-16-2.f90
@@ -27,6 +27,9 @@ program main
if (acc_is_present (h) .neqv. .TRUE.) call abort
+ ! We must wait for the update to be done.
+ call acc_wait (async)
+
h(:) = 0
call acc_copyout_async (h, sizeof (h), async)
@@ -45,6 +48,8 @@ program main
if (acc_is_present (h) .neqv. .TRUE.) call abort
+ call acc_wait (async)
+
do i = 1, N
if (h(i) /= i + i) call abort
end do
--
2.22.0