[gcc/devel/omp/gcc-14] libgomp: parallel reverse offload
Paul-Antoine Arras
parras@gcc.gnu.org
Fri Jun 28 09:55:13 GMT 2024
https://gcc.gnu.org/g:e42de770d37967ec8db69022aac65a1351cb09ca
commit e42de770d37967ec8db69022aac65a1351cb09ca
Author: Andrew Stubbs <ams@codesourcery.com>
Date: Fri Sep 1 16:49:58 2023 +0100
libgomp: parallel reverse offload
Extend OpenMP reverse offload support to allow running the host kernels
on multiple threads. The device plugin API for reverse offload is now made
non-blocking, meaning that running the host kernel in the wrong device
context is no longer a problem. The NVPTX message passing interface now
uses a ring buffer aproximately matching GCN.
include/ChangeLog:
* gomp-constants.h (GOMP_VERSION): Bump.
libgomp/ChangeLog:
* config/gcn/target.c (GOMP_target_ext): Add "signal" field.
Fix atomics race condition.
* config/nvptx/libgomp-nvptx.h (REV_OFFLOAD_QUEUE_SIZE): New define.
(struct rev_offload): Implement ring buffer.
* config/nvptx/target.c (GOMP_target_ext): Likewise.
* env.c (initialize_env): Read GOMP_REVERSE_OFFLOAD_THREADS.
* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Replace "aq" parameter
with "signal" and "use_aq".
* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Likewise.
* libgomp.h (gomp_target_rev): Likewise.
* plugin/plugin-gcn.c (process_reverse_offload): Add "signal".
(console_output): Pass signal value through.
* plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_construct):
Attach new threads to the numbered device.
Change the flag to CU_STREAM_NON_BLOCKING.
(GOMP_OFFLOAD_run): Implement ring-buffer and remove signalling.
* target.c (gomp_target_rev): Rename to ...
(gomp_target_rev_internal): ... this, and change "dev_num" to
"devicep".
(gomp_target_rev_worker_thread): New function.
(gomp_target_rev): New function (old name).
* libgomp.texi: Document GOMP_REVERSE_OFFLOAD_THREADS.
* testsuite/libgomp.c/reverse-offload-threads-1.c: New test.
* testsuite/libgomp.c/reverse-offload-threads-2.c: New test.
Diff:
---
libgomp/config/gcn/target.c | 39 ++++--
libgomp/config/nvptx/libgomp-nvptx.h | 35 ++++-
libgomp/config/nvptx/target.c | 53 +++++---
libgomp/env.c | 6 +
libgomp/libgomp-plugin.c | 4 +-
libgomp/libgomp-plugin.h | 2 +-
libgomp/libgomp.h | 3 +-
libgomp/libgomp.texi | 17 +++
libgomp/plugin/plugin-gcn.c | 8 +-
libgomp/plugin/plugin-nvptx.c | 68 ++++++++--
libgomp/target.c | 143 ++++++++++++++++++++-
.../libgomp.c/reverse-offload-threads-1.c | 26 ++++
.../libgomp.c/reverse-offload-threads-2.c | 31 +++++
13 files changed, 376 insertions(+), 59 deletions(-)
diff --git a/libgomp/config/gcn/target.c b/libgomp/config/gcn/target.c
index 1d4a23cb8d2..cc043c7b23a 100644
--- a/libgomp/config/gcn/target.c
+++ b/libgomp/config/gcn/target.c
@@ -103,19 +103,38 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
<= (index - 1024))
asm ("s_sleep 64");
+ /* In theory, it should be enough to write "written" with __ATOMIC_RELEASE,
+ and have the rest of the data flushed to memory automatically, but some
+ devices (gfx908) seem to have a race condition where the flushed data
+ arrives after the atomic data, and the host does the wrong thing.
+ If we just write everything atomically in the correct order then we're
+ safe. */
+
unsigned int slot = index % 1024;
- data->queue[slot].value_u64[0] = (uint64_t) fn;
- data->queue[slot].value_u64[1] = (uint64_t) mapnum;
- data->queue[slot].value_u64[2] = (uint64_t) hostaddrs;
- data->queue[slot].value_u64[3] = (uint64_t) sizes;
- data->queue[slot].value_u64[4] = (uint64_t) kinds;
- data->queue[slot].value_u64[5] = (uint64_t) GOMP_ADDITIONAL_ICVS.device_num;
-
- data->queue[slot].type = 4; /* Reverse offload. */
+ __atomic_store_n (&data->queue[slot].value_u64[0], (uint64_t) fn,
+ __ATOMIC_RELAXED);
+ __atomic_store_n (&data->queue[slot].value_u64[1], (uint64_t) mapnum,
+ __ATOMIC_RELAXED);
+ __atomic_store_n (&data->queue[slot].value_u64[2], (uint64_t) hostaddrs,
+ __ATOMIC_RELAXED);
+ __atomic_store_n (&data->queue[slot].value_u64[3], (uint64_t) sizes,
+ __ATOMIC_RELAXED);
+ __atomic_store_n (&data->queue[slot].value_u64[4], (uint64_t) kinds,
+ __ATOMIC_RELAXED);
+ __atomic_store_n (&data->queue[slot].value_u64[5],
+ (uint64_t) GOMP_ADDITIONAL_ICVS.device_num,
+ __ATOMIC_RELAXED);
+
+ volatile int signal = 0;
+ __atomic_store_n (&data->queue[slot].value_u64[6], (uint64_t) &signal,
+ __ATOMIC_RELAXED);
+
+ __atomic_store_n (&data->queue[slot].type, 4 /* Reverse offload. */,
+ __ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].written, 1, __ATOMIC_RELEASE);
- /* Spinlock while the host catches up. */
- while (__atomic_load_n (&data->queue[slot].written, __ATOMIC_ACQUIRE) != 0)
+ /* Spinlock while the host runs the kernel. */
+ while (__atomic_load_n (&signal, __ATOMIC_ACQUIRE) == 0)
asm ("s_sleep 64");
}
diff --git a/libgomp/config/nvptx/libgomp-nvptx.h b/libgomp/config/nvptx/libgomp-nvptx.h
index c5a54b220a5..c4a4a7fa487 100644
--- a/libgomp/config/nvptx/libgomp-nvptx.h
+++ b/libgomp/config/nvptx/libgomp-nvptx.h
@@ -25,20 +25,41 @@
/* This file contains defines and type definitions shared between the
nvptx target's libgomp.a and the plugin-nvptx.c, but that is only
- needef for this target. */
+ needed for this target. */
#ifndef LIBGOMP_NVPTX_H
#define LIBGOMP_NVPTX_H 1
#define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var
+#define REV_OFFLOAD_QUEUE_SIZE 1024
struct rev_offload {
- uint64_t fn;
- uint64_t mapnum;
- uint64_t addrs;
- uint64_t sizes;
- uint64_t kinds;
- int32_t dev_num;
+ /* The target can grab a slot by incrementing "next_slot".
+ Each host thread may claim some slots for processing.
+ When the host processing is completed "consumed" indicates that the
+ corresponding slots in the ring-buffer "queue" are available for reuse.
+
+ Note that "next_slot" is an index, and "consumed"/"claimed" are counters,
+ so beware of the fence-posts. */
+ unsigned int next_slot;
+ unsigned int consumed;
+ unsigned int claimed;
+
+ struct rev_req {
+ /* The target writes an address to "signal" as the last item, which
+ indicates to the host that the record is completely written. The target
+ must not assume that it still owns the slot, after that. The signal
+ address is then used by the host to communicate that the reverse-offload
+ kernel has completed execution. */
+ volatile int *signal;
+
+ uint64_t fn;
+ uint64_t mapnum;
+ uint64_t addrs;
+ uint64_t sizes;
+ uint64_t kinds;
+ int32_t dev_num;
+ } queue[REV_OFFLOAD_QUEUE_SIZE];
};
#if (__SIZEOF_SHORT__ != 2 \
diff --git a/libgomp/config/nvptx/target.c b/libgomp/config/nvptx/target.c
index f14dcfc8ff6..3db29d4af31 100644
--- a/libgomp/config/nvptx/target.c
+++ b/libgomp/config/nvptx/target.c
@@ -93,7 +93,6 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned short *kinds,
unsigned int flags, void **depend, void **args)
{
- static int lock = 0; /* == gomp_mutex_t lock; gomp_mutex_init (&lock); */
(void) flags;
(void) depend;
(void) args;
@@ -103,43 +102,57 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
|| GOMP_REV_OFFLOAD_VAR == NULL)
return;
- gomp_mutex_lock (&lock);
-
- GOMP_REV_OFFLOAD_VAR->mapnum = mapnum;
- GOMP_REV_OFFLOAD_VAR->addrs = (uint64_t) hostaddrs;
- GOMP_REV_OFFLOAD_VAR->sizes = (uint64_t) sizes;
- GOMP_REV_OFFLOAD_VAR->kinds = (uint64_t) kinds;
- GOMP_REV_OFFLOAD_VAR->dev_num = GOMP_ADDITIONAL_ICVS.device_num;
-
- /* Set 'fn' to trigger processing on the host; wait for completion,
- which is flagged by setting 'fn' back to 0 on the host. */
- uint64_t addr_struct_fn = (uint64_t) &GOMP_REV_OFFLOAD_VAR->fn;
+ /* Reserve one slot. */
+ unsigned int index = __atomic_fetch_add (&GOMP_REV_OFFLOAD_VAR->next_slot,
+ 1, __ATOMIC_ACQUIRE);
+
+ if ((unsigned int) (index + 1) < GOMP_REV_OFFLOAD_VAR->consumed)
+ abort (); /* Overflow. */
+
+ /* Spinlock while the host catches up. */
+ if (index >= REV_OFFLOAD_QUEUE_SIZE)
+ while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->consumed, __ATOMIC_ACQUIRE)
+ <= (index - REV_OFFLOAD_QUEUE_SIZE))
+ ; /* spin */
+
+ unsigned int slot = index % REV_OFFLOAD_QUEUE_SIZE;
+ GOMP_REV_OFFLOAD_VAR->queue[slot].fn = (uint64_t) fn;
+ GOMP_REV_OFFLOAD_VAR->queue[slot].mapnum = mapnum;
+ GOMP_REV_OFFLOAD_VAR->queue[slot].addrs = (uint64_t) hostaddrs;
+ GOMP_REV_OFFLOAD_VAR->queue[slot].sizes = (uint64_t) sizes;
+ GOMP_REV_OFFLOAD_VAR->queue[slot].kinds = (uint64_t) kinds;
+ GOMP_REV_OFFLOAD_VAR->queue[slot].dev_num = GOMP_ADDITIONAL_ICVS.device_num;
+
+ /* Set 'signal' to trigger processing on the host; the slot is now consumed
+ by the host, so we should not touch it again. */
+ volatile int signal = 0;
+ uint64_t addr_struct_signal = (uint64_t) &GOMP_REV_OFFLOAD_VAR->queue[slot].signal;
#if __PTX_SM__ >= 700
asm volatile ("st.global.release.sys.u64 [%0], %1;"
- : : "r"(addr_struct_fn), "r" (fn) : "memory");
+ : : "r"(addr_struct_signal), "r" (&signal) : "memory");
#else
__sync_synchronize (); /* membar.sys */
asm volatile ("st.volatile.global.u64 [%0], %1;"
- : : "r"(addr_struct_fn), "r" (fn) : "memory");
+ : : "r"(addr_struct_signal), "r" (&signal) : "memory");
#endif
+ /* The host signals completion by writing a non-zero value to the 'signal'
+ variable. */
#if __PTX_SM__ >= 700
- uint64_t fn2;
+ uint64_t signal2;
do
{
asm volatile ("ld.acquire.sys.global.u64 %0, [%1];"
- : "=r" (fn2) : "r" (addr_struct_fn) : "memory");
+ : "=r" (signal2) : "r" (&signal) : "memory");
}
- while (fn2 != 0);
+ while (signal2 == 0);
#else
/* ld.global.u64 %r64,[__gomp_rev_offload_var];
ld.u64 %r36,[%r64];
membar.sys; */
- while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_ACQUIRE) != 0)
+ while (__atomic_load_n (&signal, __ATOMIC_ACQUIRE) == 0)
; /* spin */
#endif
-
- gomp_mutex_unlock (&lock);
}
void
diff --git a/libgomp/env.c b/libgomp/env.c
index 47d8c637b66..9165ba45f85 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -124,6 +124,7 @@ size_t gomp_affinity_format_len;
char *goacc_device_type;
int goacc_device_num;
int goacc_default_dims[GOMP_DIM_MAX];
+int gomp_reverse_offload_threads = 8; /* Reasonable default. */
#ifndef LIBGOMP_OFFLOADED_ONLY
@@ -2488,6 +2489,11 @@ initialize_env (void)
handle_omp_display_env ();
+ /* Control the number of background threads reverse offload is permitted
+ to use. */
+ parse_int_secure ("GOMP_REVERSE_OFFLOAD_THREADS",
+ &gomp_reverse_offload_threads, false);
+
/* OpenACC. */
if (!parse_int ("ACC_DEVICE_NUM", getenv ("ACC_DEVICE_NUM"),
diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
index febc94d36f9..08226347bc8 100644
--- a/libgomp/libgomp-plugin.c
+++ b/libgomp/libgomp-plugin.c
@@ -82,8 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
void
GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
- struct goacc_asyncqueue *aq)
+ volatile int *signal, bool use_aq)
{
gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
- aq);
+ signal, use_aq);
}
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 0d2e3f0a6ec..180740b62cd 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -123,7 +123,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
__attribute__ ((noreturn, format (printf, 1, 2)));
extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
- uint64_t, int, struct goacc_asyncqueue *);
+ uint64_t, int, volatile int *, bool);
/* Prototypes for functions implemented by libgomp plugins. */
extern const char *GOMP_OFFLOAD_get_name (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index ea7e06b6d7b..88a690a4867 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -613,6 +613,7 @@ extern struct gomp_offload_icv_list *gomp_offload_icv_list;
extern int goacc_device_num;
extern char *goacc_device_type;
extern int goacc_default_dims[GOMP_DIM_MAX];
+extern int gomp_reverse_offload_threads;
enum gomp_task_kind
{
@@ -1127,7 +1128,7 @@ extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
- int, struct goacc_asyncqueue *);
+ int, volatile int *, bool);
extern bool gomp_page_locked_host_alloc (void **, size_t);
extern void gomp_page_locked_host_free (void *);
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index bc105464208..d635a324f8b 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -3403,6 +3403,7 @@ variable is not set.
* GOMP_STACKSIZE:: Set default thread stack size
* GOMP_SPINCOUNT:: Set the busy-wait spin count
* GOMP_RTEMS_THREAD_POOLS:: Set the RTEMS specific thread pools
+* GOMP_REVERSE_OFFLOAD_THREADS:: Set the maximum number of host threads
@end menu
@@ -4163,6 +4164,22 @@ pools available and their worker threads run at priority four.
+@node GOMP_REVERSE_OFFLOAD_THREADS
+@section @env{GOMP_REVERSE_OFFLOAD_THREADS} -- Set the maximum number of host threads
+@cindex Environment Variable
+@table @asis
+@item @emph{Description}
+Set the maximum number of threads that may be used to run reverse offload
+code sections (host code nested within offload regions, declared using
+@code{#pragma omp target device(ancestor:1)}). The value should be a non-zero
+positive integer. The default is 8 threads.
+
+The threads are created on demand, up to the maximum number given, and are
+destroyed when no reverse offload requests remain.
+@end table
+
+
+
@c ---------------------------------------------------------------------
@c Enabling OpenACC
@c ---------------------------------------------------------------------
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 4988dfb0c8e..6d5d7d4335a 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -2066,11 +2066,12 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
static void
process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
- uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
+ uint64_t sizes, uint64_t kinds, uint64_t dev_num64,
+ uint64_t signal)
{
int dev_num = dev_num64;
GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
- NULL);
+ (volatile int *) signal, false);
}
/* Output any data written to console output from the kernel. It is expected
@@ -2120,7 +2121,8 @@ console_output (struct kernel_info *kernel, struct kernargs *kernargs,
case 4:
process_reverse_offload (data->value_u64[0], data->value_u64[1],
data->value_u64[2], data->value_u64[3],
- data->value_u64[4], data->value_u64[5]);
+ data->value_u64[4], data->value_u64[5],
+ data->value_u64[6]);
break;
default: printf ("GCN print buffer error!\n"); break;
}
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 84bac0beafd..e4672afe123 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1913,9 +1913,10 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags)
}
struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+GOMP_OFFLOAD_openacc_async_construct (int device)
{
- return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+ nvptx_attach_host_thread_to_device (device);
+ return nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
}
static bool
@@ -2475,17 +2476,68 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
else if (r != CUDA_ERROR_NOT_READY)
GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
- if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
+ struct rev_offload *rev_metadata = ptx_dev->rev_data;
+
+ /* Claim a portion of the ring buffer to process on this iteration.
+ Don't mark them as consumed until all the data has been read out. */
+ unsigned int consumed = __atomic_load_n (&rev_metadata->consumed,
+ __ATOMIC_ACQUIRE);
+ unsigned int from = __atomic_load_n (&rev_metadata->claimed,
+ __ATOMIC_RELAXED);
+ unsigned int to = __atomic_load_n (&rev_metadata->next_slot,
+ __ATOMIC_RELAXED);
+
+ if (consumed > to)
+ {
+ /* Overflow happens when we exceed UINTMAX requests. */
+ GOMP_PLUGIN_fatal ("NVPTX reverse offload buffer overflowed.\n");
+ }
+
+ to = MIN(to, consumed + REV_OFFLOAD_QUEUE_SIZE / 2);
+ if (to <= from)
+ /* Nothing to do; poll again. */
+ goto poll_again;
+
+ if (!__atomic_compare_exchange_n (&rev_metadata->claimed, &from, to,
+ false,
+ __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+ /* Collision with another thread ... go around again. */
+ goto poll_again;
+
+ unsigned int index;
+ for (index = from; index < to; index++)
{
- struct rev_offload *rev_data = ptx_dev->rev_data;
+ int slot = index % REV_OFFLOAD_QUEUE_SIZE;
+
+ /* Wait while the target finishes filling in the slot. */
+ while (__atomic_load_n (&ptx_dev->rev_data->queue[slot].signal,
+ __ATOMIC_ACQUIRE) == 0)
+ ; /* spin */
+
+ /* Pass the request to libgomp; this will queue the request and
+ return right away, without waiting for the kernel to run. */
+ struct rev_req *rev_data = &ptx_dev->rev_data->queue[slot];
GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
rev_data->addrs, rev_data->sizes,
rev_data->kinds, rev_data->dev_num,
- reverse_offload_aq);
- if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
- exit (EXIT_FAILURE);
- __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+ rev_data->signal, true);
+
+ /* Ensure that the slot doesn't trigger early, when reused. */
+ __atomic_store_n (&rev_data->signal, 0, __ATOMIC_RELEASE);
}
+
+ /* The data is now consumed so release the slots for reuse. */
+ unsigned int consumed_so_far = from;
+ while (!__atomic_compare_exchange_n (&rev_metadata->consumed,
+ &consumed_so_far, to, false,
+ __ATOMIC_RELEASE, __ATOMIC_RELAXED))
+ {
+ /* Another thread didn't consume all it claimed yet.... */
+ consumed_so_far = from;
+ usleep (1);
+ }
+
+poll_again:
usleep (1);
}
else
diff --git a/libgomp/target.c b/libgomp/target.c
index a75cd751b39..c28c3e1e5bb 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -3645,16 +3645,18 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
tgt_start, tgt_end);
}
-/* Handle reverse offload. This is called by the device plugins for a
- reverse offload; it is not called if the outer target runs on the host.
+/* Handle reverse offload. This is called by the host worker thread to
+ execute a single reverse offload request; it is not called if the outer
+ target runs on the host.
The mapping is simplified device-affecting constructs (except for target
with device(ancestor:1)) must not be encountered; in particular not
target (enter/exit) data. */
-void
-gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
- uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
- struct goacc_asyncqueue *aq)
+static void
+gomp_target_rev_internal (uint64_t fn_ptr, uint64_t mapnum,
+ uint64_t devaddrs_ptr, uint64_t sizes_ptr,
+ uint64_t kinds_ptr, struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq)
{
/* Return early if there is no offload code. */
if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3671,7 +3673,6 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
unsigned short *kinds;
const bool short_mapkind = true;
const int typemask = short_mapkind ? 0xff : 0x7;
- struct gomp_device_descr *devicep = resolve_device (dev_num, false);
reverse_splay_tree_key n;
struct reverse_splay_tree_key_s k;
@@ -4082,6 +4083,134 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
}
}
+static struct target_rev_queue_s
+{
+ uint64_t fn_ptr;
+ uint64_t mapnum;
+ uint64_t devaddrs_ptr;
+ uint64_t sizes_ptr;
+ uint64_t kinds_ptr;
+ struct gomp_device_descr *devicep;
+
+ volatile int *signal;
+ bool use_aq;
+
+ struct target_rev_queue_s *next;
+} *target_rev_queue_head = NULL, *target_rev_queue_last = NULL;
+static gomp_mutex_t target_rev_queue_lock = 0;
+static int target_rev_thread_count = 0;
+
+static void *
+gomp_target_rev_worker_thread (void *)
+{
+ struct target_rev_queue_s *rev_kernel = NULL;
+ struct goacc_asyncqueue *aq = NULL;
+ struct gomp_device_descr *aq_devicep;
+
+ while (1)
+ {
+ gomp_mutex_lock (&target_rev_queue_lock);
+
+ /* Take a reverse-offload kernel request from the queue. */
+ rev_kernel = target_rev_queue_head;
+ if (rev_kernel)
+ {
+ target_rev_queue_head = rev_kernel->next;
+ if (target_rev_queue_head == NULL)
+ target_rev_queue_last = NULL;
+ }
+
+ if (rev_kernel == NULL)
+ {
+ target_rev_thread_count--;
+ gomp_mutex_unlock (&target_rev_queue_lock);
+ break;
+ }
+ gomp_mutex_unlock (&target_rev_queue_lock);
+
+ /* Ensure we have a suitable device queue for the memory transfers. */
+ if (rev_kernel->use_aq)
+ {
+ if (aq && aq_devicep != rev_kernel->devicep)
+ {
+ aq_devicep->openacc.async.destruct_func (aq);
+ aq = NULL;
+ }
+
+ if (!aq)
+ {
+ aq_devicep = rev_kernel->devicep;
+ aq = aq_devicep->openacc.async.construct_func (aq_devicep->target_id);
+ }
+ }
+
+ /* Run the kernel on the host. */
+ gomp_target_rev_internal (rev_kernel->fn_ptr, rev_kernel->mapnum,
+ rev_kernel->devaddrs_ptr, rev_kernel->sizes_ptr,
+ rev_kernel->kinds_ptr, rev_kernel->devicep, aq);
+
+ /* Signal the device that the reverse-offload is completed. */
+ int one = 1;
+ gomp_copy_host2dev (rev_kernel->devicep, aq, (void*)rev_kernel->signal,
+ &one, sizeof (one), false, NULL);
+
+ /* We're done with this request. */
+ free (rev_kernel);
+
+ /* Loop around and see if another request is waiting. */
+ }
+
+ if (aq)
+ aq_devicep->openacc.async.destruct_func (aq);
+
+ return NULL;
+}
+
+void
+gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
+ uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
+ volatile int *signal, bool use_aq)
+{
+ struct gomp_device_descr *devicep = resolve_device (dev_num, false);
+
+ /* Create a new queue node. */
+ struct target_rev_queue_s *newreq = gomp_malloc (sizeof (*newreq));
+ newreq->fn_ptr = fn_ptr;
+ newreq->mapnum = mapnum;
+ newreq->devaddrs_ptr = devaddrs_ptr;
+ newreq->sizes_ptr = sizes_ptr;
+ newreq->kinds_ptr = kinds_ptr;
+ newreq->devicep = devicep;
+ newreq->signal = signal;
+ newreq->use_aq = use_aq;
+ newreq->next = NULL;
+
+ gomp_mutex_lock (&target_rev_queue_lock);
+
+ /* Enqueue the reverse-offload request. */
+ if (target_rev_queue_last)
+ {
+ target_rev_queue_last->next = newreq;
+ target_rev_queue_last = newreq;
+ }
+ else
+ target_rev_queue_last = target_rev_queue_head = newreq;
+
+ /* Launch a new thread to process the request asynchronously.
+ If the thread pool limit has been reached then an existing thread will
+ pick up the job when it is ready. */
+ if (target_rev_thread_count < gomp_reverse_offload_threads)
+ {
+ target_rev_thread_count++;
+ gomp_mutex_unlock (&target_rev_queue_lock);
+
+ pthread_t t;
+ pthread_create (&t, NULL, gomp_target_rev_worker_thread, NULL);
+ }
+ else
+ gomp_mutex_unlock (&target_rev_queue_lock);
+}
+
/* Host fallback for GOMP_target_data{,_ext} routines. */
static void
diff --git a/libgomp/testsuite/libgomp.c/reverse-offload-threads-1.c b/libgomp/testsuite/libgomp.c/reverse-offload-threads-1.c
new file mode 100644
index 00000000000..fa74a8e9668
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reverse-offload-threads-1.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
+
+/* Test that the reverse offload message buffers can cope with a lot of
+ requests. */
+
+#pragma omp requires reverse_offload
+
+int main ()
+{
+ #pragma omp target teams distribute parallel for collapse(2)
+ for (int i=0; i < 100; i++)
+ for (int j=0; j < 16; j++)
+ {
+ int val = 0;
+ #pragma omp target device ( ancestor:1 ) firstprivate(i,j) map(from:val)
+ {
+ val = i + j;
+ }
+
+ if (val != i + j)
+ __builtin_abort ();
+ }
+
+ return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/reverse-offload-threads-2.c b/libgomp/testsuite/libgomp.c/reverse-offload-threads-2.c
new file mode 100644
index 00000000000..05a2571ed14
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reverse-offload-threads-2.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
+
+/* Test that the reverse offload message buffers can cope with multiple
+ requests from multiple kernels. */
+
+#pragma omp requires reverse_offload
+
+int main ()
+{
+ for (int n=0; n < 5; n++)
+ {
+ #pragma omp target teams distribute parallel for nowait collapse(2)
+ for (int i=0; i < 32; i++)
+ for (int j=0; j < 16; j++)
+ {
+ int val = 0;
+ #pragma omp target device ( ancestor:1 ) firstprivate(i,j) map(from:val)
+ {
+ val = i + j;
+ }
+
+ if (val != i + j)
+ __builtin_abort ();
+ }
+ }
+
+#pragma omp taskwait
+
+ return 0;
+}
More information about the Gcc-cvs
mailing list