This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[PATCH] OpenACC reference count consistency checking
- From: Julian Brown <julian at codesourcery dot com>
- To: <gcc-patches at gcc dot gnu dot org>
- Cc: Chung-Lin Tang <chunglin_tang at mentor dot com>, Thomas Schwinge <thomas at codesourcery dot com>, Jakub Jelinek <jakub at redhat dot com>, <Catherine_Moore at mentor dot com>
- Date: Fri, 30 Nov 2018 03:50:24 -0800
- Subject: [PATCH] OpenACC reference count consistency checking
This is a trunk-compatible version of the patch posted here:
https://gcc.gnu.org/ml/gcc-patches/2018-11/msg02365.html
I understand it may not be suitable for committing (especially not
outside stage 1 -- though it's "obviously harmless" in its dormant state),
but it might be helpful for review purposes for the main attach/detach
patch, i.e.:
https://gcc.gnu.org/ml/gcc-patches/2018-11/msg02556.html
For convenience, I will copy the blurb from the og8 submission of the
patch here.
[...] The model used for checking is as follows.
1. Each splay tree key that references a target memory descriptor
increases that descriptor's refcount by 1.
2. Each variable listed in a target memory descriptor that links back to a
splay tree key increases that key's refcount by 1. Each target memory
descriptor's variable list is counted only once, even if multiple
splay tree keys point to it (via their "tgt" field).
3. Additional ("real") target memory descriptors may be present
representing data mapped through "acc data" or "acc parallel/kernels"
blocks. These descriptors have their refcount bumped, and the
variables linked through such blocks have their refcounts bumped also
(again, with "once only" semantics).
4. Asynchronous operations "artificially" bump the reference counts for
referenced target memory descriptors (but *not* for linked
variables/splay tree keys), in order to delay freeing mapped device
memory until the asynchronous operation has completed. We model this,
for checking purposes only, using an off-side linked list.
5. "Virtual" reference counts ("virtual_refcount") cannot be checked
purely statically, so we add the incoming value to each key's
statically-determined reference count ("refcount_chk"), and make
sure that the total matches the incoming reference count ("refcount").
Thanks,
Julian
ChangeLog
libgomp/
* libgomp.h (RC_CHECKING): New macro, disabled by default, guarding all
hunks in this patch.
(target_mem_desc): Add forward declaration.
(async_tgt_use): New struct.
(target_mem_desc): Add refcount_chk, mark fields.
(acc_dispatch_t): Add tgt_uses, au_lock fields.
(dump_tgt, gomp_rc_check): Add prototypes.
* oacc-async (goacc_async_unmap_tgt): Add refcount self-check code.
(goacc_async_copyout_unmap_vars): Likewise.
(goacc_remove_var_async): Likewise.
* oacc-parallel.c (GOACC_parallel_keyed_internal): Add refcount
self-check code.
(GOACC_data_start, GOACC_data_end, GOACC_enter_exit_data): Likewise.
* target.c (stdio.h): Include.
(dump_tgt, rc_check_clear, rc_check_count, rc_check_verify)
(gomp_rc_check): New functions to consistency-check reference counts.
(gomp_target_init): Initialise self-check-related device fields.
---
libgomp/libgomp.h | 31 +++++++
libgomp/oacc-async.c | 46 +++++++++++
libgomp/oacc-parallel.c | 33 ++++++++
libgomp/target.c | 199 +++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 309 insertions(+), 0 deletions(-)
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index df49c1b..24cbddd 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -874,9 +874,26 @@ struct target_var_desc {
uintptr_t length;
};
+/* Uncomment to enable reference-count consistency checking (for development
+ use only). */
+/*#define RC_CHECKING 1*/
+
+#ifdef RC_CHECKING
+struct target_mem_desc;
+
+struct async_tgt_use {
+ struct target_mem_desc *tgt;
+ struct async_tgt_use *next;
+};
+#endif
+
struct target_mem_desc {
/* Reference count. */
uintptr_t refcount;
+#ifdef RC_CHECKING
+ uintptr_t refcount_chk;
+ bool mark;
+#endif
/* All the splay nodes allocated together. */
splay_tree_node array;
/* Start of the target region. */
@@ -925,6 +942,10 @@ struct splay_tree_key_s {
"present increment" operations (via "acc enter data") refering to the same
host-memory block. */
uintptr_t virtual_refcount;
+#ifdef RC_CHECKING
+ /* The recalculated reference count, for verification. */
+ uintptr_t refcount_chk;
+#endif
/* For a block with attached pointers, the attachment counters for each. */
unsigned short *attach_count;
/* Pointer to the original mapping of "omp declare target link" object. */
@@ -958,6 +979,10 @@ typedef struct acc_dispatch_t
int nasyncqueue;
struct goacc_asyncqueue **asyncqueue;
struct goacc_asyncqueue_list *active;
+#ifdef RC_CHECKING
+ struct async_tgt_use *tgt_uses;
+ gomp_mutex_t au_lock;
+#endif
__typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
__typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
@@ -1085,6 +1110,12 @@ extern void gomp_detach_pointer (struct gomp_device_descr *,
struct goacc_asyncqueue *, splay_tree_key,
uintptr_t, bool, struct gomp_coalesce_buf *);
+#ifdef RC_CHECKING
+extern void dump_tgt (const char *, struct target_mem_desc *);
+extern void gomp_rc_check (struct gomp_device_descr *,
+ struct target_mem_desc *);
+#endif
+
extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
size_t, void **, void **,
size_t *, void *, bool,
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 077e28f..8b0f228 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -243,6 +243,29 @@ goacc_async_unmap_tgt (void *ptr)
{
struct target_mem_desc *tgt = (struct target_mem_desc *) ptr;
+#ifdef RC_CHECKING
+ {
+ struct gomp_device_descr *devicep = tgt->device_descr;
+ struct async_tgt_use *aup, *au;
+ gomp_mutex_lock (&devicep->openacc.async.au_lock);
+ /* Remove tgt from asynchronous-use list. */
+ for (aup = NULL, au = devicep->openacc.async.tgt_uses; au;
+ aup = au, au = au->next)
+ if (au->tgt == tgt)
+ {
+ if (aup)
+ aup->next = au->next;
+ else
+ devicep->openacc.async.tgt_uses = au->next;
+ free (au);
+ break;
+ }
+ if (!au)
+ gomp_fatal ("can't find tgt %p to remove in async list", tgt);
+ gomp_mutex_unlock (&devicep->openacc.async.au_lock);
+ }
+#endif
+
if (tgt->refcount > 1)
tgt->refcount--;
else
@@ -258,6 +281,18 @@ goacc_async_copyout_unmap_vars (struct target_mem_desc *tgt,
/* Increment reference to delay freeing of device memory until callback
has triggered. */
tgt->refcount++;
+
+#ifdef RC_CHECKING
+ {
+ struct async_tgt_use *au = malloc (sizeof (struct async_tgt_use));
+ gomp_mutex_lock (&devicep->openacc.async.au_lock);
+ /* Record the asynchronous use of this target_mem_desc. */
+ au->next = devicep->openacc.async.tgt_uses;
+ au->tgt = tgt;
+ devicep->openacc.async.tgt_uses = au;
+ gomp_mutex_unlock (&devicep->openacc.async.au_lock);
+ }
+#endif
gomp_unmap_vars_async (tgt, true, aq);
devicep->openacc.async.queue_callback_func (aq, goacc_async_unmap_tgt,
(void *) tgt);
@@ -276,6 +311,17 @@ goacc_remove_var_async (struct gomp_device_descr *devicep, splay_tree_key n,
struct target_mem_desc *tgt = n->tgt;
assert (tgt);
tgt->refcount++;
+#ifdef RC_CHECKING
+ {
+ gomp_mutex_lock (&devicep->openacc.async.au_lock);
+ struct async_tgt_use *au = malloc (sizeof (struct async_tgt_use));
+ /* Record the asynchronous use of this target_mem_desc. */
+ au->next = devicep->openacc.async.tgt_uses;
+ au->tgt = tgt;
+ devicep->openacc.async.tgt_uses = au;
+ gomp_mutex_unlock (&devicep->openacc.async.au_lock);
+ }
+#endif
gomp_remove_var (devicep, n);
devicep->openacc.async.queue_callback_func (aq, goacc_async_unmap_tgt,
(void *) tgt);
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 0e9a3e8..22aafb2 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -253,6 +253,15 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
true, GOMP_MAP_VARS_OPENACC);
+#ifdef RC_CHECKING
+ gomp_mutex_lock (&acc_dev->lock);
+ assert (tgt);
+ dump_tgt (__FUNCTION__, tgt);
+ tgt->prev = thr->mapped_data;
+ gomp_rc_check (acc_dev, tgt);
+ gomp_mutex_unlock (&acc_dev->lock);
+#endif
+
devaddrs = gomp_alloca (sizeof (void *) * mapnum);
for (i = 0; i < mapnum; i++)
devaddrs[i] = (void *) gomp_map_val (tgt, hostaddrs, i);
@@ -270,6 +279,12 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
dims, tgt, aq);
goacc_async_copyout_unmap_vars (tgt, aq);
}
+
+#ifdef RC_CHECKING
+ gomp_mutex_lock (&acc_dev->lock);
+ gomp_rc_check (acc_dev, thr->mapped_data);
+ gomp_mutex_unlock (&acc_dev->lock);
+#endif
}
/* Legacy entry point, only provide host execution. */
@@ -324,6 +339,12 @@ GOACC_data_start (int device, size_t mapnum,
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
tgt->prev = thr->mapped_data;
thr->mapped_data = tgt;
+
+#ifdef RC_CHECKING
+ gomp_mutex_lock (&acc_dev->lock);
+ gomp_rc_check (acc_dev, thr->mapped_data);
+ gomp_mutex_unlock (&acc_dev->lock);
+#endif
}
void
@@ -336,6 +357,12 @@ GOACC_data_end (void)
thr->mapped_data = tgt->prev;
gomp_unmap_vars (tgt, true);
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
+
+#ifdef RC_CHECKING
+ gomp_mutex_lock (&thr->dev->lock);
+ gomp_rc_check (thr->dev, thr->mapped_data);
+ gomp_mutex_unlock (&thr->dev->lock);
+#endif
}
void
@@ -624,6 +651,12 @@ GOACC_enter_exit_data (int device, size_t mapnum,
}
}
}
+
+#ifdef RC_CHECKING
+ gomp_mutex_lock (&acc_dev->lock);
+ gomp_rc_check (acc_dev, thr->mapped_data);
+ gomp_mutex_unlock (&acc_dev->lock);
+#endif
}
static void
diff --git a/libgomp/target.c b/libgomp/target.c
index 6e115d1..b9f8ce8 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -40,6 +40,9 @@
#include <assert.h>
#include <errno.h>
#include <limits.h>
+#ifdef RC_CHECKING
+#include <stdio.h>
+#endif
#ifdef PLUGIN_SUPPORT
#include <dlfcn.h>
@@ -360,6 +363,198 @@ gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
}
}
+#ifdef RC_CHECKING
+void
+dump_tgt (const char *where, struct target_mem_desc *tgt)
+{
+ if (!getenv ("GOMP_DEBUG_TGT"))
+ return;
+
+ fprintf (stderr, "%s: %s: tgt=%p\n", __FUNCTION__, where, (void*) tgt);
+ fprintf (stderr, "refcount=%d\n", (int) tgt->refcount);
+ fprintf (stderr, "tgt_start=%p\n", (void*) tgt->tgt_start);
+ fprintf (stderr, "tgt_end=%p\n", (void*) tgt->tgt_end);
+ fprintf (stderr, "to_free=%p\n", tgt->to_free);
+ fprintf (stderr, "list_count=%d\n", (int) tgt->list_count);
+ for (int i = 0; i < tgt->list_count; i++)
+ {
+ fprintf (stderr, "list item %d:\n", i);
+ fprintf (stderr, " key: %p\n", (void*) tgt->list[i].key);
+ if (tgt->list[i].key)
+ {
+ fprintf (stderr, " key.host_start=%p\n",
+ (void*) tgt->list[i].key->host_start);
+ fprintf (stderr, " key.host_end=%p\n",
+ (void*) tgt->list[i].key->host_end);
+ fprintf (stderr, " key.tgt=%p\n", (void*) tgt->list[i].key->tgt);
+ fprintf (stderr, " key.offset=%d\n",
+ (int) tgt->list[i].key->tgt_offset);
+ fprintf (stderr, " key.refcount=%d\n",
+ (int) tgt->list[i].key->refcount);
+ fprintf (stderr, " key.virtual_refcount=%d\n",
+ (int) tgt->list[i].key->virtual_refcount);
+ fprintf (stderr, " key.attach_count=%p\n",
+ (void*) tgt->list[i].key->attach_count);
+ fprintf (stderr, " key.link_key=%p\n",
+ (void*) tgt->list[i].key->link_key);
+ }
+ }
+ fprintf (stderr, "\n");
+}
+
+static void
+rc_check_clear (splay_tree_node node)
+{
+ splay_tree_key k = &node->key;
+
+ k->refcount_chk = 0;
+ k->tgt->refcount_chk = 0;
+ k->tgt->mark = false;
+
+ if (node->left)
+ rc_check_clear (node->left);
+ if (node->right)
+ rc_check_clear (node->right);
+}
+
+static void
+rc_check_count (splay_tree_node node)
+{
+ splay_tree_key k = &node->key;
+ struct target_mem_desc *t;
+
+ /* Add virtual reference counts ("acc enter data", etc.) for this key. */
+ k->refcount_chk += k->virtual_refcount;
+
+ t = k->tgt;
+ t->refcount_chk++;
+
+ if (!t->mark)
+ {
+ for (int i = 0; i < t->list_count; i++)
+ if (t->list[i].key)
+ t->list[i].key->refcount_chk++;
+
+ t->mark = true;
+ }
+
+ if (node->left)
+ rc_check_count (node->left);
+ if (node->right)
+ rc_check_count (node->right);
+}
+
+static bool
+rc_check_verify (splay_tree_node node, bool noisy, bool errors)
+{
+ splay_tree_key k = &node->key;
+ struct target_mem_desc *t;
+
+ if (k->refcount != REFCOUNT_INFINITY)
+ {
+ if (noisy)
+ fprintf (stderr, "key %p (%p..+%d): rc=%d/%d, virt_rc=%d\n", k,
+ (void *) k->host_start, (int) (k->host_end - k->host_start),
+ (int) k->refcount, (int) k->refcount_chk,
+ (int) k->virtual_refcount);
+
+ if (k->refcount != k->refcount_chk)
+ {
+ if (noisy)
+ fprintf (stderr, " -- key refcount mismatch!\n");
+ errors = true;
+ }
+
+ t = k->tgt;
+
+ if (noisy)
+ fprintf (stderr, "tgt %p: rc=%d/%d\n", t, (int) t->refcount,
+ (int) t->refcount_chk);
+
+ if (t->refcount != t->refcount_chk)
+ {
+ if (noisy)
+ fprintf (stderr,
+ " -- target memory descriptor refcount mismatch!\n");
+ errors = true;
+ }
+ }
+
+ if (node->left)
+ errors |= rc_check_verify (node->left, noisy, errors);
+ if (node->right)
+ errors |= rc_check_verify (node->right, noisy, errors);
+
+ return errors;
+}
+
+/* Call with device locked. */
+
+attribute_hidden void
+gomp_rc_check (struct gomp_device_descr *devicep, struct target_mem_desc *tgt)
+{
+ splay_tree sp = &devicep->mem_map;
+
+ bool noisy = getenv ("GOMP_DEBUG_TGT") != 0;
+
+ if (noisy)
+ fprintf (stderr, "\n*** GOMP_RC_CHECK ***\n\n");
+
+ if (sp->root)
+ {
+ gomp_mutex_lock (&devicep->openacc.async.au_lock);
+ struct async_tgt_use *async_uses = devicep->openacc.async.tgt_uses;
+
+ rc_check_clear (sp->root);
+
+ for (struct target_mem_desc *t = tgt; t; t = t->prev)
+ {
+ t->refcount_chk = 0;
+ t->mark = false;
+ }
+ for (struct async_tgt_use *au = async_uses; au; au = au->next)
+ {
+ struct target_mem_desc *t = au->tgt;
+ t->refcount_chk = 0;
+ t->mark = false;
+ }
+
+ /* Add references for interconnected splay-tree keys. */
+ rc_check_count (sp->root);
+
+ /* Add references for the tgt for a currently-executing kernel and/or
+ any enclosing data directives. */
+ for (struct target_mem_desc *t = tgt; t; t = t->prev)
+ {
+ t->refcount_chk++;
+
+ if (!t->mark)
+ {
+ for (int i = 0; i < t->list_count; i++)
+ if (t->list[i].key)
+ t->list[i].key->refcount_chk++;
+
+ t->mark = true;
+ }
+ }
+
+ /* Add references from in-progress asynchronous operations. */
+ for (struct async_tgt_use *au = async_uses; au; au = au->next)
+ {
+ struct target_mem_desc *t = au->tgt;
+ t->refcount_chk++;
+ }
+
+ if (rc_check_verify (sp->root, noisy, false))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("refcount checking failure");
+ }
+ gomp_mutex_unlock (&devicep->openacc.async.au_lock);
+ }
+}
+#endif
+
/* Handle the case where gomp_map_lookup, splay_tree_lookup or
gomp_map_0len_lookup found oldn for newn.
Helper function of gomp_map_vars. */
@@ -3274,6 +3469,10 @@ gomp_target_init (void)
current_device.type = current_device.get_type_func ();
current_device.mem_map.root = NULL;
current_device.state = GOMP_DEVICE_UNINITIALIZED;
+#ifdef RC_CHECKING
+ current_device.openacc.async.tgt_uses = NULL;
+ gomp_mutex_init (¤t_device.openacc.async.au_lock);
+#endif
/* Augment DEVICES and NUM_DEVICES. */
devices = gomp_realloc (devices,