This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [gomp4.1] depend nowait support for target {update,{enter,exit} data}


Hi!

On Tue, Sep 08, 2015 at 11:20:14 +0200, Jakub Jelinek wrote:
> nowait support for #pragma omp target is not implemented yet, supposedly we
> need to mark those somehow (some flag) already in the struct gomp_task
> structure, essentially it will need either 2 or 3 callbacks
> (the current one, executed when the dependencies are resolved (it actually
> waits until some thread schedules it after that point, I think it is
> undesirable to run it with the tasking lock held), which would perform
> the gomp_map_vars and initiate the running of the region, and then some
> query routine which would poll the plugin whether the task is done or not,
> and either perform the finalization (unmap_vars) if it is done (and in any
> case return bool whether it should be polled again or not), and if the
> finalization is not done there, also another callback for the finalization.
> Also, there is the issue that if we are waiting for task that needs to be
> polled, and we don't have any further tasks to run, we shouldn't really
> attempt to sleep on some semaphore (e.g. in taskwait, end of
> taskgroup, etc.) or barrier, but rather either need to keep polling it, or
> call the query hook with some argument that it should sleep in there until
> the work is done by the offloading device.
> Also, there needs to be a way for the target nowait first callback to say
> that it is using host fallback and thus acts as a normal task, therefore
> once the task fn finishes, the task is done.

Here is my WIP patch.  target.c part is obviously incorrect, but it demonstrates
a possible libgomp <-> plugin interface for running a target task function
asynchronously and checking whether it is completed or not.
(Refactored liboffloadmic/runtime/emulator from trunk is required to run
target-tmp.c testcase.)


diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index d798321..8e2b5aa 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -872,6 +872,8 @@ struct gomp_device_descr
   void *(*host2dev_func) (int, void *, const void *, size_t);
   void *(*dev2dev_func) (int, void *, const void *, size_t);
   void (*run_func) (int, void *, void *);
+  void (*async_run_func) (int, void *, void *, const void *);
+  bool (*async_is_completed_func) (int, const void *);
 
   /* Splay tree containing information about mapped memory regions.  */
   struct splay_tree_s mem_map;
diff --git a/libgomp/target.c b/libgomp/target.c
index 77bd442..31f034c 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -45,6 +45,10 @@
 #include "plugin-suffix.h"
 #endif
 
+/* FIXME: TMP */
+#include <stdio.h>
+#include <unistd.h>
+
 static void gomp_target_init (void);
 
 /* The whole initialization code for offloading plugins is only run one.  */
@@ -1227,6 +1231,44 @@ gomp_target_fallback (void (*fn) (void *), void **hostaddrs)
   *thr = old_thr;
 }
 
+/* Host fallback with firstprivate map-type handling.  */
+
+static void
+gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
+				   void **hostaddrs, size_t *sizes,
+				   unsigned short *kinds)
+{
+  size_t i, tgt_align = 0, tgt_size = 0;
+  char *tgt = NULL;
+  for (i = 0; i < mapnum; i++)
+    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+      {
+	size_t align = (size_t) 1 << (kinds[i] >> 8);
+	if (tgt_align < align)
+	  tgt_align = align;
+	tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	tgt_size += sizes[i];
+      }
+  if (tgt_align)
+    {
+      tgt = gomp_alloca (tgt_size + tgt_align - 1);
+      uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+      if (al)
+	tgt += tgt_align - al;
+      tgt_size = 0;
+      for (i = 0; i < mapnum; i++)
+	if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+	  {
+	    size_t align = (size_t) 1 << (kinds[i] >> 8);
+	    tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	    memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+	    hostaddrs[i] = tgt + tgt_size;
+	    tgt_size = tgt_size + sizes[i];
+	  }
+    }
+  gomp_target_fallback (fn, hostaddrs);
+}
+
 /* Helper function of GOMP_target{,_41} routines.  */
 
 static void *
@@ -1311,40 +1353,19 @@ GOMP_target_41 (int device, void (*fn) (void *), size_t mapnum,
   if (devicep == NULL
       || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     {
-      size_t i, tgt_align = 0, tgt_size = 0;
-      char *tgt = NULL;
-      for (i = 0; i < mapnum; i++)
-	if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
-	  {
-	    size_t align = (size_t) 1 << (kinds[i] >> 8);
-	    if (tgt_align < align)
-	      tgt_align = align;
-	    tgt_size = (tgt_size + align - 1) & ~(align - 1);
-	    tgt_size += sizes[i];
-	  }
-      if (tgt_align)
-	{
-	  tgt = gomp_alloca (tgt_size + tgt_align - 1);
-	  uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
-	  if (al)
-	    tgt += tgt_align - al;
-	  tgt_size = 0;
-	  for (i = 0; i < mapnum; i++)
-	    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
-	      {
-		size_t align = (size_t) 1 << (kinds[i] >> 8);
-		tgt_size = (tgt_size + align - 1) & ~(align - 1);
-		memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
-		hostaddrs[i] = tgt + tgt_size;
-		tgt_size = tgt_size + sizes[i];
-	      }
-	}
-      gomp_target_fallback (fn, hostaddrs);
+      gomp_target_fallback_firstprivate (fn, mapnum, hostaddrs, sizes, kinds);
       return;
     }
 
   void *fn_addr = gomp_get_target_fn_addr (devicep, fn);
 
+  if (flags & GOMP_TARGET_FLAG_NOWAIT)
+    {
+      gomp_create_target_task (devicep, fn_addr, mapnum, hostaddrs, sizes,
+			       kinds, flags, depend);
+      return;
+    }
+
   struct target_mem_desc *tgt_vars
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
 		     GOMP_MAP_VARS_TARGET);
@@ -1636,34 +1657,58 @@ void
 gomp_target_task_fn (void *data)
 {
   struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  struct gomp_device_descr *devicep = ttask->devicep;
+
   if (ttask->fn != NULL)
     {
-      /* GOMP_target_41 */
+      if (devicep == NULL
+	  || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	{
+	  /* FIXME: Save host fn addr into gomp_target_task?  */
+	  gomp_target_fallback_firstprivate (NULL, ttask->mapnum,
+					     ttask->hostaddrs, ttask->sizes,
+					     ttask->kinds);
+	  return;
+	}
+
+      struct target_mem_desc *tgt_vars
+	= gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
+			 ttask->sizes, ttask->kinds, true,
+			 GOMP_MAP_VARS_TARGET);
+      devicep->async_run_func (devicep->target_id, ttask->fn,
+			       (void *) tgt_vars->tgt_start, data);
+
+      /* FIXME: TMP example of checking for completion.
+	 Alternatively the plugin can set some completion flag in ttask.  */
+      while (!devicep->async_is_completed_func (devicep->target_id, data))
+	{
+	  fprintf (stderr, "-");
+	  usleep (100000);
+	}
     }
-  else if (ttask->devicep == NULL
-	   || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+  else if (devicep == NULL
+	   || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     return;
 
   size_t i;
   if (ttask->flags & GOMP_TARGET_FLAG_UPDATE)
-    gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+    gomp_update (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
 		 ttask->kinds, true);
   else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
     for (i = 0; i < ttask->mapnum; i++)
       if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT)
 	{
-	  gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1,
-			 &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
-			 &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+	  gomp_map_vars (devicep, ttask->sizes[i] + 1, &ttask->hostaddrs[i],
+			 NULL, &ttask->sizes[i], &ttask->kinds[i], true,
+			 GOMP_MAP_VARS_ENTER_DATA);
 	  i += ttask->sizes[i];
 	}
       else
-	gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL,
-		       &ttask->sizes[i], &ttask->kinds[i],
-		       true, GOMP_MAP_VARS_ENTER_DATA);
+	gomp_map_vars (devicep, 1, &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
+		       &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
   else
-    gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs,
-		    ttask->sizes, ttask->kinds);
+    gomp_exit_data (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+		    ttask->kinds);
 }
 
 void
@@ -2108,6 +2153,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
       DLSYM (run);
+      DLSYM (async_run);
+      DLSYM (async_is_completed);
       DLSYM (dev2dev);
     }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
diff --git a/libgomp/testsuite/libgomp.c/target-tmp.c b/libgomp/testsuite/libgomp.c/target-tmp.c
new file mode 100644
index 0000000..23a739c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-tmp.c
@@ -0,0 +1,40 @@
+#include <stdio.h>
+#include <unistd.h>
+
+#pragma omp declare target
+void foo (int n)
+{
+  printf ("Start tgt %d\n", n);
+  usleep (5000000);
+  printf ("End tgt %d\n", n);
+}
+#pragma omp end declare target
+
+int x, y, z;
+
+int main ()
+{
+  #pragma omp parallel
+  #pragma omp single
+    {
+      #pragma omp task depend(out: x)
+      printf ("Host task\n");
+
+      #pragma omp target nowait depend(in: x) depend(out: y)
+      foo (1);
+
+      #pragma omp target nowait depend(in: y)
+      foo (2);
+
+      #pragma omp target nowait depend(in: y)
+      foo (3);
+
+      while (1)
+	{
+	  usleep (333333);
+	  fprintf (stderr, ".");
+	}
+    }
+
+  return 0;
+}
diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index 26ac6fe..c843710 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -34,6 +34,7 @@
 #include <string.h>
 #include <utility>
 #include <vector>
+#include <set>
 #include <map>
 #include "libgomp-plugin.h"
 #include "compiler_if_host.h"
@@ -76,9 +77,15 @@ static int num_images;
    second key is number of device.  Contains a vector of pointer pairs.  */
 static ImgDevAddrMap *address_table;
 
+/* Set of asynchronously running target tasks.  */
+static std::set<const void *> *async_tasks;
+
 /* Thread-safe registration of the main image.  */
 static pthread_once_t main_image_is_registered = PTHREAD_ONCE_INIT;
 
+/* Mutex for protecting async_tasks.  */
+static pthread_mutex_t async_tasks_lock = PTHREAD_MUTEX_INITIALIZER;
+
 static VarDesc vd_host2tgt = {
   { 1, 1 },		      /* dst, src			      */
   { 1, 0 },		      /* in, out			      */
@@ -156,6 +163,8 @@ init (void)
 
 out:
   address_table = new ImgDevAddrMap;
+  async_tasks = new std::set<const void *>;
+  pthread_mutex_init (&async_tasks_lock, NULL);
   num_devices = _Offload_number_of_devices ();
 }
 
@@ -192,11 +201,27 @@ GOMP_OFFLOAD_get_num_devices (void)
 
 static void
 offload (const char *file, uint64_t line, int device, const char *name,
-	 int num_vars, VarDesc *vars, VarDesc2 *vars2)
+	 int num_vars, VarDesc *vars, VarDesc2 *vars2, const void *async_data)
 {
   OFFLOAD ofld = __offload_target_acquire1 (&device, file, line);
   if (ofld)
-    __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, NULL);
+    {
+      if (async_data == NULL)
+	__offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL,
+			    NULL);
+      else
+	{
+	  pthread_mutex_lock (&async_tasks_lock);
+	  async_tasks->insert (async_data);
+	  pthread_mutex_unlock (&async_tasks_lock);
+
+	  OffloadFlags flags;
+	  flags.flags = 0;
+	  flags.bits.omp_async = 1;
+	  __offload_offload3 (ofld, name, 0, num_vars, vars, NULL, 0, NULL,
+			      (const void **) async_data, 0, NULL, flags, NULL);
+	}
+    }
   else
     {
       fprintf (stderr, "%s:%d: Offload target acquire failed\n", file, line);
@@ -218,7 +243,7 @@ GOMP_OFFLOAD_init_device (int device)
   TRACE ("");
   pthread_once (&main_image_is_registered, register_main_image);
   offload (__FILE__, __LINE__, device, "__offload_target_init_proc", 0,
-	   NULL, NULL);
+	   NULL, NULL, NULL);
 }
 
 extern "C" void
@@ -240,7 +265,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
   VarDesc2 vd1g[2] = { { "num_funcs", 0 }, { "num_vars", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_table_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   int table_size = num_funcs + 2 * num_vars;
   if (table_size > 0)
@@ -254,7 +279,7 @@ get_target_table (int device, int &num_funcs, int &num_vars, void **&table)
       VarDesc2 vd2g = { "table", 0 };
 
       offload (__FILE__, __LINE__, device, "__offload_target_table_p2", 1,
-	       &vd2, &vd2g);
+	       &vd2, &vd2g, NULL);
     }
 }
 
@@ -401,8 +426,8 @@ GOMP_OFFLOAD_alloc (int device, size_t size)
   vd1[1].size = sizeof (void *);
   VarDesc2 vd1g[2] = { { "size", 0 }, { "tgt_ptr", 0 } };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g);
-
+  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g,
+	   NULL);
   return tgt_ptr;
 }
 
@@ -416,7 +441,8 @@ GOMP_OFFLOAD_free (int device, void *tgt_ptr)
   vd1.size = sizeof (void *);
   VarDesc2 vd1g = { "tgt_ptr", 0 };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g);
+  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g,
+	   NULL);
 }
 
 extern "C" void *
@@ -435,7 +461,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_host2tgt;
   vd2.ptr = (void *) host_ptr;
@@ -443,7 +469,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const void *host_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p2", 1,
-	   &vd2, &vd2g);
+	   &vd2, &vd2g, NULL);
 
   return tgt_ptr;
 }
@@ -464,7 +490,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p1", 2,
-	   vd1, vd1g);
+	   vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_tgt2host;
   vd2.ptr = (void *) host_ptr;
@@ -472,7 +498,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const void *tgt_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p2", 1,
-	   &vd2, &vd2g);
+	   &vd2, &vd2g, NULL);
 
   return host_ptr;
 }
@@ -495,22 +521,56 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst_ptr, const void *src_ptr,
   VarDesc2 vd1g[3] = { { "dst_ptr", 0 }, { "src_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2tgt", 3, vd1,
-	   vd1g);
+	   vd1g, NULL);
 
   return dst_ptr;
 }
 
 extern "C" void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+			const void *async_data)
+{
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device,
+	 tgt_fn, tgt_vars, async_data);
+
+  VarDesc vd[2] = { vd_host2tgt, vd_host2tgt };
+  vd[0].ptr = &tgt_fn;
+  vd[0].size = sizeof (void *);
+  vd[1].ptr = &tgt_vars;
+  vd[1].size = sizeof (void *);
+
+  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd, NULL,
+	   async_data);
+}
+
+extern "C" void
 GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
 {
-  TRACE ("(tgt_fn = %p, tgt_vars = %p)", tgt_fn, tgt_vars);
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, tgt_vars);
 
-  VarDesc vd1[2] = { vd_host2tgt, vd_host2tgt };
-  vd1[0].ptr = &tgt_fn;
-  vd1[0].size = sizeof (void *);
-  vd1[1].ptr = &tgt_vars;
-  vd1[1].size = sizeof (void *);
-  VarDesc2 vd1g[2] = { { "tgt_fn", 0 }, { "tgt_vars", 0 } };
+  GOMP_OFFLOAD_async_run (device, tgt_fn, tgt_vars, NULL);
+}
+
+extern "C" bool
+GOMP_OFFLOAD_async_is_completed (int device, const void *async_data)
+{
+  TRACE ("(device = %d, async_data = %p)", device, async_data);
+
+  bool res;
+  pthread_mutex_lock (&async_tasks_lock);
+  res = async_tasks->count (async_data) == 0;
+  pthread_mutex_unlock (&async_tasks_lock);
+  return res;
+}
+
+/* Called by liboffloadmic when asynchronous function is completed.  */
+
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *async_data)
+{
+  TRACE ("(async_data = %p)", async_data);
 
-  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd1, vd1g);
+  pthread_mutex_lock (&async_tasks_lock);
+  async_tasks->erase (async_data);
+  pthread_mutex_unlock (&async_tasks_lock);
 }
diff --git a/liboffloadmic/runtime/offload_host.cpp b/liboffloadmic/runtime/offload_host.cpp
index 08f626f..8cee12c 100644
--- a/liboffloadmic/runtime/offload_host.cpp
+++ b/liboffloadmic/runtime/offload_host.cpp
@@ -64,6 +64,9 @@ static void __offload_fini_library(void);
 #define GET_OFFLOAD_NUMBER(timer_data) \
     timer_data? timer_data->offload_number : 0
 
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *);
+
 extern "C" {
 #ifdef TARGET_WINNT
 // Windows does not support imports from libraries without actually
@@ -2507,7 +2510,7 @@ extern "C" {
         const void *info
     )
     {
-	/* TODO: Call callback function, pass info.  */
+	__gomp_offload_intelmic_async_completed (info);
     }
 }
 

  -- Ilya


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]