This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [gomp4.1] depend nowait support for target {update,{enter,exit} data}
- From: Ilya Verbin <iverbin at gmail dot com>
- To: Jakub Jelinek <jakub at redhat dot com>
- Cc: Aldy Hernandez <aldyh at redhat dot com>, gcc-patches at gcc dot gnu dot org, Kirill Yukhin <kirill dot yukhin at gmail dot com>
- Date: Fri, 2 Oct 2015 22:28:01 +0300
- Subject: Re: [gomp4.1] depend nowait support for target {update,{enter,exit} data}
- Authentication-results: sourceware.org; auth=none
- References: <20150908092014 dot GA1847 at tucnak dot redhat dot com>
Hi!
On Tue, Sep 08, 2015 at 11:20:14 +0200, Jakub Jelinek wrote:
> nowait support for #pragma omp target is not implemented yet, supposedly we
> need to mark those somehow (some flag) already in the struct gomp_task
> structure, essentially it will need either 2 or 3 callbacks
> (the current one, executed when the dependencies are resolved (it actually
> waits until some thread schedules it after that point, I think it is
> undesirable to run it with the tasking lock held), which would perform
> the gomp_map_vars and initiate the running of the region, and then some
> query routine which would poll the plugin whether the task is done or not,
> and either perform the finalization (unmap_vars) if it is done (and in any
> case return bool whether it should be polled again or not), and if the
> finalization is not done there, also another callback for the finalization.
> Also, there is the issue that if we are waiting for task that needs to be
> polled, and we don't have any further tasks to run, we shouldn't really
> attempt to sleep on some semaphore (e.g. in taskwait, end of
> taskgroup, etc.) or barrier, but rather either need to keep polling it, or
> call the query hook with some argument that it should sleep in there until
> the work is done by the offloading device.
> Also, there needs to be a way for the target nowait first callback to say
> that it is using host fallback and thus acts as a normal task, therefore
> once the task fn finishes, the task is done.
Here is my WIP patch. target.c part is obviously incorrect, but it demonstrates
a possible libgomp <-> plugin interface for running a target task function
asynchronously and checking whether it is completed or not.
(Refactored liboffloadmic/runtime/emulator from trunk is required to run
target-tmp.c testcase.)
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index d798321..8e2b5aa 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -872,6 +872,8 @@ struct gomp_device_descr
void *(*host2dev_func) (int, void *, const void *, size_t);
void *(*dev2dev_func) (int, void *, const void *, size_t);
void (*run_func) (int, void *, void *);
+ void (*async_run_func) (int, void *, void *, const void *);
+ bool (*async_is_completed_func) (int, const void *);
/* Splay tree containing information about mapped memory regions. */
struct splay_tree_s mem_map;
diff --git a/libgomp/target.c b/libgomp/target.c
index 77bd442..31f034c 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -45,6 +45,10 @@
#include "plugin-suffix.h"
#endif
+/* FIXME: TMP */
+#include <stdio.h>
+#include <unistd.h>
+
static void gomp_target_init (void);
/* The whole initialization code for offloading plugins is only run one. */
@@ -1227,6 +1231,44 @@ gomp_target_fallback (void (*fn) (void *), void **hostaddrs)
*thr = old_thr;
}
+/* Host fallback with firstprivate map-type handling. */
+
+static void
+gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
+ void **hostaddrs, size_t *sizes,
+ unsigned short *kinds)
+{
+ size_t i, tgt_align = 0, tgt_size = 0;
+ char *tgt = NULL;
+ for (i = 0; i < mapnum; i++)
+ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+ {
+ size_t align = (size_t) 1 << (kinds[i] >> 8);
+ if (tgt_align < align)
+ tgt_align = align;
+ tgt_size = (tgt_size + align - 1) & ~(align - 1);
+ tgt_size += sizes[i];
+ }
+ if (tgt_align)
+ {
+ tgt = gomp_alloca (tgt_size + tgt_align - 1);
+ uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+ if (al)
+ tgt += tgt_align - al;
+ tgt_size = 0;
+ for (i = 0; i < mapnum; i++)
+ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+ {
+ size_t align = (size_t) 1 << (kinds[i] >> 8);
+ tgt_size = (tgt_size + align - 1) & ~(align - 1);
+ memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+ hostaddrs[i] = tgt + tgt_size;
+ tgt_size = tgt_size + sizes[i];
+ }
+ }
+ gomp_target_fallback (fn, hostaddrs);
+}
+
/* Helper function of GOMP_target{,_41} routines. */
static void *
@@ -1311,40 +1353,19 @@ GOMP_target_41 (int device, void (*fn) (void *), size_t mapnum,
if (devicep == NULL
|| !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
{
- size_t i, tgt_align = 0, tgt_size = 0;
- char *tgt = NULL;
- for (i = 0; i < mapnum; i++)
- if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
- {
- size_t align = (size_t) 1 << (kinds[i] >> 8);
- if (tgt_align < align)
- tgt_align = align;
- tgt_size = (tgt_size + align - 1) & ~(align - 1);
- tgt_size += sizes[i];
- }
- if (tgt_align)
- {
- tgt = gomp_alloca (tgt_size + tgt_align - 1);
- uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
- if (al)
- tgt += tgt_align - al;
- tgt_size = 0;
- for (i = 0; i < mapnum; i++)
- if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
- {
- size_t align = (size_t) 1 << (kinds[i] >> 8);
- tgt_size = (tgt_size + align - 1) & ~(align - 1);
- memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
- hostaddrs[i] = tgt + tgt_size;
- tgt_size = tgt_size + sizes[i];
- }
- }
- gomp_target_fallback (fn, hostaddrs);
+ gomp_target_fallback_firstprivate (fn, mapnum, hostaddrs, sizes, kinds);
return;
}
void *fn_addr = gomp_get_target_fn_addr (devicep, fn);
+ if (flags & GOMP_TARGET_FLAG_NOWAIT)
+ {
+ gomp_create_target_task (devicep, fn_addr, mapnum, hostaddrs, sizes,
+ kinds, flags, depend);
+ return;
+ }
+
struct target_mem_desc *tgt_vars
= gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
GOMP_MAP_VARS_TARGET);
@@ -1636,34 +1657,58 @@ void
gomp_target_task_fn (void *data)
{
struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+ struct gomp_device_descr *devicep = ttask->devicep;
+
if (ttask->fn != NULL)
{
- /* GOMP_target_41 */
+ if (devicep == NULL
+ || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+ {
+ /* FIXME: Save host fn addr into gomp_target_task? */
+ gomp_target_fallback_firstprivate (NULL, ttask->mapnum,
+ ttask->hostaddrs, ttask->sizes,
+ ttask->kinds);
+ return;
+ }
+
+ struct target_mem_desc *tgt_vars
+ = gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
+ ttask->sizes, ttask->kinds, true,
+ GOMP_MAP_VARS_TARGET);
+ devicep->async_run_func (devicep->target_id, ttask->fn,
+ (void *) tgt_vars->tgt_start, data);
+
+ /* FIXME: TMP example of checking for completion.
+ Alternatively the plugin can set some completion flag in ttask. */
+ while (!devicep->async_is_completed_func (devicep->target_id, data))
+ {
+ fprintf (stderr, "-");
+ usleep (100000);
+ }
}
- else if (ttask->devicep == NULL
- || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+ else if (devicep == NULL
+ || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
return;
size_t i;
if (ttask->flags & GOMP_TARGET_FLAG_UPDATE)
- gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+ gomp_update (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
ttask->kinds, true);
else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
for (i = 0; i < ttask->mapnum; i++)
if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT)
{
- gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1,
- &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
- &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+ gomp_map_vars (devicep, ttask->sizes[i] + 1, &ttask->hostaddrs[i],
+ NULL, &ttask->sizes[i], &ttask->kinds[i], true,
+ GOMP_MAP_VARS_ENTER_DATA);
i += ttask->sizes[i];
}
else
- gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL,
- &ttask->sizes[i], &ttask->kinds[i],
- true, GOMP_MAP_VARS_ENTER_DATA);
+ gomp_map_vars (devicep, 1, &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
+ &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
else
- gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs,
- ttask->sizes, ttask->kinds);
+ gomp_exit_data (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+ ttask->kinds);
}
void
@@ -2108,6 +2153,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
{
DLSYM (run);
+ DLSYM (async_run);
+ DLSYM (async_is_completed);
DLSYM (dev2dev);
}
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
diff --git a/libgomp/testsuite/libgomp.c/target-tmp.c b/libgomp/testsuite/libgomp.c/target-tmp.c
new file mode 100644
index 0000000..23a739c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-tmp.c
@@ -0,0 +1,40 @@
+#include <stdio.h>
+#include <unistd.h>
+
+#pragma omp declare target
+void foo (int n)
+{
+ printf ("Start tgt %d\n", n);
+ usleep (5000000);
+ printf ("End tgt %d\n", n);
+}
+#pragma omp end declare target
+
+int x, y, z;
+
+int main ()
+{
+ #pragma omp parallel
+ #pragma omp single
+ {
+ #pragma omp task depend(out: x)
+ printf ("Host task\n");
+
+ #pragma omp target nowait depend(in: x) depend(out: y)
+ foo (1);
+
+ #pragma omp target nowait depend(in: y)
+ foo (2);
+
+ #pragma omp target nowait depend(in: y)
+ foo (3);
+
+ while (1)
+ {
+ usleep (333333);
+ fprintf (stderr, ".");
+ }
+ }
+
+ return 0;
+}
diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index 26ac6fe..c843710 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -34,6 +34,7 @@
#include <string.h>
#include <utility>
#include <vector>
+#include <set>
#include <map>
#include "libgomp-plugin.h"
#include "compiler_if_host.h"
@@ -76,9 +77,15 @@ static int num_images;
second key is number of device. Contains a vector of pointer pairs. */
static ImgDevAddrMap *address_table;
+/* Set of asynchronously running target tasks. */
+static std::set<const void *> *async_tasks;
+
/* Thread-safe registration of the main image. */
static pthread_once_t main_image_is_registered = PTHREAD_ONCE_INIT;
+/* Mutex for protecting async_tasks. */
+static pthread_mutex_t async_tasks_lock = PTHREAD_MUTEX_INITIALIZER;
+
static VarDesc vd_host2tgt = {
{ 1, 1 }, /* dst, src */
{ 1, 0 }, /* in, out */
@@ -156,6 +163,8 @@ init (void)
out:
address_table = new ImgDevAddrMap;
+ async_tasks = new std::set<const void *>;
+ pthread_mutex_init (&async_tasks_lock, NULL);
num_devices = _Offload_number_of_devices ();
}
@@ -192,11 +201,27 @@ GOMP_OFFLOAD_get_num_devices (void)
static void
offload (const char *file, uint64_t line, int device, const char *name,
- int num_vars, VarDesc *vars, VarDesc2 *vars2)
+ int num_vars, VarDesc *vars, VarDesc2 *vars2, const void *async_data)
{
OFFLOAD ofld = __offload_target_acquire1 (&device, file, line);
if (ofld)
- __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, NULL);
+ {
+ if (async_data == NULL)
+ __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL,
+ NULL);
+ else
+ {
+ pthread_mutex_lock (&async_tasks_lock);
+ async_tasks->insert (async_data);
+ pthread_mutex_unlock (&async_tasks_lock);
+
+ OffloadFlags flags;
+ flags.flags = 0;
+ flags.bits.omp_async = 1;
+ __offload_offload3 (ofld, name, 0, num_vars, vars, NULL, 0, NULL,
+ (const void **) async_data, 0, NULL, flags, NULL);
+ }
+ }
else
{
fprintf (stderr, "%s:%d: Offload target acquire failed\n", file, line);
@@ -218,7 +243,7 @@ GOMP_OFFLOAD_init_device (int device)
TRACE ("");
pthread_once (&main_image_is_registered, register_main_image);
offload (__FILE__, __LINE__, device, "__offload_target_init_proc", 0,
- NULL, NULL);
+ NULL, NULL, NULL);
}
extern "C" void
@@ -240,7 +265,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
VarDesc2 vd1g[2] = { { "num_funcs", 0 }, { "num_vars", 0 } };
offload (__FILE__, __LINE__, device, "__offload_target_table_p1", 2,
- vd1, vd1g);
+ vd1, vd1g, NULL);
int table_size = num_funcs + 2 * num_vars;
if (table_size > 0)
@@ -254,7 +279,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
VarDesc2 vd2g = { "table", 0 };
offload (__FILE__, __LINE__, device, "__offload_target_table_p2", 1,
- &vd2, &vd2g);
+ &vd2, &vd2g, NULL);
}
}
@@ -401,8 +426,8 @@ GOMP_OFFLOAD_alloc (int device, size_t size)
vd1[1].size = sizeof (void *);
VarDesc2 vd1g[2] = { { "size", 0 }, { "tgt_ptr", 0 } };
- offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g);
-
+ offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g,
+ NULL);
return tgt_ptr;
}
@@ -416,7 +441,8 @@ GOMP_OFFLOAD_free (int device, void *tgt_ptr)
vd1.size = sizeof (void *);
VarDesc2 vd1g = { "tgt_ptr", 0 };
- offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g);
+ offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g,
+ NULL);
}
extern "C" void *
@@ -435,7 +461,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p1", 2,
- vd1, vd1g);
+ vd1, vd1g, NULL);
VarDesc vd2 = vd_host2tgt;
vd2.ptr = (void *) host_ptr;
@@ -443,7 +469,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
VarDesc2 vd2g = { "var", 0 };
offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p2", 1,
- &vd2, &vd2g);
+ &vd2, &vd2g, NULL);
return tgt_ptr;
}
@@ -464,7 +490,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p1", 2,
- vd1, vd1g);
+ vd1, vd1g, NULL);
VarDesc vd2 = vd_tgt2host;
vd2.ptr = (void *) host_ptr;
@@ -472,7 +498,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
VarDesc2 vd2g = { "var", 0 };
offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p2", 1,
- &vd2, &vd2g);
+ &vd2, &vd2g, NULL);
return host_ptr;
}
@@ -495,22 +521,56 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst_ptr, const void *src_ptr,
VarDesc2 vd1g[3] = { { "dst_ptr", 0 }, { "src_ptr", 0 }, { "size", 0 } };
offload (__FILE__, __LINE__, device, "__offload_target_tgt2tgt", 3, vd1,
- vd1g);
+ vd1g, NULL);
return dst_ptr;
}
extern "C" void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+ const void *async_data)
+{
+ TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device,
+ tgt_fn, tgt_vars, async_data);
+
+ VarDesc vd[2] = { vd_host2tgt, vd_host2tgt };
+ vd[0].ptr = &tgt_fn;
+ vd[0].size = sizeof (void *);
+ vd[1].ptr = &tgt_vars;
+ vd[1].size = sizeof (void *);
+
+ offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd, NULL,
+ async_data);
+}
+
+extern "C" void
GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
{
- TRACE ("(tgt_fn = %p, tgt_vars = %p)", tgt_fn, tgt_vars);
+ TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, tgt_vars);
- VarDesc vd1[2] = { vd_host2tgt, vd_host2tgt };
- vd1[0].ptr = &tgt_fn;
- vd1[0].size = sizeof (void *);
- vd1[1].ptr = &tgt_vars;
- vd1[1].size = sizeof (void *);
- VarDesc2 vd1g[2] = { { "tgt_fn", 0 }, { "tgt_vars", 0 } };
+ GOMP_OFFLOAD_async_run (device, tgt_fn, tgt_vars, NULL);
+}
+
+extern "C" bool
+GOMP_OFFLOAD_async_is_completed (int device, const void *async_data)
+{
+ TRACE ("(device = %d, async_data = %p)", device, async_data);
+
+ bool res;
+ pthread_mutex_lock (&async_tasks_lock);
+ res = async_tasks->count (async_data) == 0;
+ pthread_mutex_unlock (&async_tasks_lock);
+ return res;
+}
+
+/* Called by liboffloadmic when asynchronous function is completed. */
+
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *async_data)
+{
+ TRACE ("(async_data = %p)", async_data);
- offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd1, vd1g);
+ pthread_mutex_lock (&async_tasks_lock);
+ async_tasks->erase (async_data);
+ pthread_mutex_unlock (&async_tasks_lock);
}
diff --git a/liboffloadmic/runtime/offload_host.cpp b/liboffloadmic/runtime/offload_host.cpp
index 08f626f..8cee12c 100644
--- a/liboffloadmic/runtime/offload_host.cpp
+++ b/liboffloadmic/runtime/offload_host.cpp
@@ -64,6 +64,9 @@ static void __offload_fini_library(void);
#define GET_OFFLOAD_NUMBER(timer_data) \
timer_data? timer_data->offload_number : 0
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *);
+
extern "C" {
#ifdef TARGET_WINNT
// Windows does not support imports from libraries without actually
@@ -2507,7 +2510,7 @@ extern "C" {
const void *info
)
{
- /* TODO: Call callback function, pass info. */
+ __gomp_offload_intelmic_async_completed (info);
}
}
-- Ilya