This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.



Re: [libgomp, GSoC'19] Work-stealing task scheduling


On Mon, Aug 26, 2019 at 02:58:46PM +0900, Ray Kim wrote:
> Fixed typos, lowered memory constraints where appropriate.

Your mailer sadly made the patch fail to apply; I had to spend quite a
long time just undoing the unwanted line breaking.  Please fix your
mailer not to break lines automatically, or at least send patches as
attachments, where hopefully it will not do that.

Before starting review, I've also fixed various formatting glitches.
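
For anyone following along, the core idea implemented by the patch
(per-thread task queues, stealing from a uniformly random victim when
the thread's own queue is empty) boils down to roughly the following
self-contained sketch.  This is not the patch code itself; the names,
the fixed-size queue and the pending counter are invented here purely
for illustration:

  #include <stdlib.h>
  #include <pthread.h>

  struct queue { pthread_mutex_t lock; int n; void *buf[64]; };

  /* Pop one item from Q, or return NULL if Q is empty.  */
  static void *
  try_pop (struct queue *q)
  {
    void *t = NULL;
    pthread_mutex_lock (&q->lock);
    if (q->n > 0)
      t = q->buf[--q->n];
    pthread_mutex_unlock (&q->lock);
    return t;
  }

  /* Take from the thread's own queue first; otherwise keep stealing
     from random victims until a task is found or no queued tasks
     remain, similarly to gomp_dequeue_task in the patch.  */
  static void *
  steal_one (struct queue *queues, int nthreads, int self, int *pending)
  {
    void *t = try_pop (&queues[self]);
    while (t == NULL)
      {
        if (__atomic_load_n (pending, __ATOMIC_ACQUIRE) == 0)
          return NULL;
        t = try_pop (&queues[rand () % nthreads]);
      }
    __atomic_sub_fetch (pending, 1, __ATOMIC_ACQ_REL);
    return t;
  }

The important property is that a thread only touches other threads'
queues when its own is empty, so in the common case there is no
contention on a shared lock, unlike the old single team->task_lock.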

Here is the fixed up patch (and attached interdiff with the changes I've
done):

--- libgomp/config/linux/bar.c.jj	2019-09-02 12:24:54.900952708 +0200
+++ libgomp/config/linux/bar.c	2019-09-02 12:25:37.446319594 +0200
@@ -199,13 +199,13 @@ gomp_team_barrier_wait_cancel (gomp_barr
 void
 gomp_team_barrier_cancel (struct gomp_team *team)
 {
-  gomp_mutex_lock (&team->task_lock);
+  gomp_mutex_lock (&team->barrier_lock);
   if (team->barrier.generation & BAR_CANCELLED)
     {
-      gomp_mutex_unlock (&team->task_lock);
+      gomp_mutex_unlock (&team->barrier_lock);
       return;
     }
   team->barrier.generation |= BAR_CANCELLED;
-  gomp_mutex_unlock (&team->task_lock);
+  gomp_mutex_unlock (&team->barrier_lock);
   futex_wake ((int *) &team->barrier.generation, INT_MAX);
 }
--- libgomp/task.c.jj	2019-01-01 12:38:37.977648900 +0100
+++ libgomp/task.c	2019-09-02 12:54:27.879483614 +0200
@@ -29,6 +29,7 @@
 #include "libgomp.h"
 #include <stdlib.h>
 #include <string.h>
+#include <math.h>
 #include "gomp-constants.h"
 
 typedef struct gomp_task_depend_entry *hash_entry_type;
@@ -76,16 +77,21 @@ gomp_init_task (struct gomp_task *task,
   task->parent = parent_task;
   task->icv = *prev_icv;
   task->kind = GOMP_TASK_IMPLICIT;
-  task->taskwait = NULL;
+  task->state = GOMP_STATE_NORMAL;
   task->in_tied_task = false;
   task->final_task = false;
   task->copy_ctors_done = false;
   task->parent_depends_on = false;
-  priority_queue_init (&task->children_queue);
   task->taskgroup = NULL;
   task->dependers = NULL;
   task->depend_hash = NULL;
   task->depend_count = 0;
+  task->num_awaited = 0;
+  task->num_children = 0;
+  /* Currently we're initializing the depend lock for every task.
+     However, the mutex could instead be initialized on demand by the
+     dependers.  */
+  gomp_mutex_init (&task->depend_lock);
 }
 
 /* Clean up a task, after completing it.  */
@@ -100,61 +106,115 @@ gomp_end_task (void)
   thr->task = task->parent;
 }
 
-/* Clear the parent field of every task in LIST.  */
+/* If the task is free to go, clean it up.  */
 
-static inline void
-gomp_clear_parent_in_list (struct priority_list *list)
+void
+gomp_maybe_end_task (void)
 {
-  struct priority_node *p = list->tasks;
-  if (p)
-    do
-      {
-	priority_node_to_task (PQ_CHILDREN, p)->parent = NULL;
-	p = p->next;
-      }
-    while (p != list->tasks);
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_task *task = thr->task;
+
+  if (__atomic_load_n (&task->num_children, MEMMODEL_ACQUIRE) == 0)
+    gomp_finish_task (task);
+  thr->task = task->parent;
 }
 
-/* Splay tree version of gomp_clear_parent_in_list.
+/* Enqueue the task to its corresponding task queue.
 
-   Clear the parent field of every task in NODE within SP, and free
-   the node when done.  */
+   Currently, the 'corresponding task queue' is the queue dedicated to the
+   calling thread.  TEAM and THR are passed as an optimization.  */
 
-static void
-gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node)
+void
+gomp_enqueue_task (struct gomp_task *task, struct gomp_team *team,
+		   struct gomp_thread *thr, int priority)
 {
-  if (!node)
-    return;
-  prio_splay_tree_node left = node->left, right = node->right;
-  gomp_clear_parent_in_list (&node->key.l);
+  int tid = thr->ts.team_id;
+  struct gomp_taskqueue *queue = &gomp_team_taskqueue (team)[tid];
+
+  __atomic_add_fetch (&team->task_queued_count, 1, MEMMODEL_ACQ_REL);
+  gomp_mutex_lock (&queue->queue_lock);
+  priority_queue_insert (&queue->priority_queue, task, priority,
+			 PRIORITY_INSERT_END, task->parent_depends_on);
+  gomp_mutex_unlock (&queue->queue_lock);
+  return;
+}
+
+/* Dequeue a task from the specified queue.  */
+
+inline static struct gomp_task *
+gomp_dequeue_task_from_queue (int qid, struct gomp_team *team)
+{
+  struct gomp_task *task = NULL;
+  struct gomp_taskqueue *queue = &gomp_team_taskqueue (team)[qid];
+  gomp_mutex_lock (&queue->queue_lock);
+
 #if _LIBGOMP_CHECKING_
-  memset (node, 0xaf, sizeof (*node));
+  priority_queue_verify (&queue->priority_queue, false);
 #endif
-  /* No need to remove the node from the tree.  We're nuking
-     everything, so just free the nodes and our caller can clear the
-     entire splay tree.  */
-  free (node);
-  gomp_clear_parent_in_tree (sp, left);
-  gomp_clear_parent_in_tree (sp, right);
+
+  if (priority_queue_empty_p (&queue->priority_queue, MEMMODEL_RELAXED))
+    {
+      gomp_mutex_unlock (&queue->queue_lock);
+      return NULL;
+    }
+
+  task = priority_queue_next_task (&queue->priority_queue);
+
+  if (__atomic_load_n (&task->kind, MEMMODEL_ACQUIRE) != GOMP_TASK_WAITING)
+    {
+      gomp_mutex_unlock (&queue->queue_lock);
+      return NULL;
+    }
+
+  priority_queue_remove (&queue->priority_queue, task, MEMMODEL_RELAXED);
+
+  task->pnode.next = NULL;
+  task->pnode.prev = NULL;
+  gomp_mutex_unlock (&queue->queue_lock);
+  return task;
 }
 
-/* Clear the parent field of every task in Q and remove every task
-   from Q.  */
+/* Dequeue a task from the thread's dedicated queue or steal a task from
+   another queue.  */
 
-static inline void
-gomp_clear_parent (struct priority_queue *q)
+struct gomp_task *
+gomp_dequeue_task (struct gomp_team *team, struct gomp_thread *thr)
 {
-  if (priority_queue_multi_p (q))
+  if (__atomic_load_n (&team->task_queued_count, MEMMODEL_ACQUIRE) == 0)
+    return NULL;
+
+  int qid = thr->ts.team_id;
+  struct gomp_task *task = NULL;
+
+  task = gomp_dequeue_task_from_queue (qid, team);
+  while (task == NULL)
     {
-      gomp_clear_parent_in_tree (&q->t, q->t.root);
-      /* All the nodes have been cleared in gomp_clear_parent_in_tree.
-	 No need to remove anything.  We can just nuke everything.  */
-      q->t.root = NULL;
+      if (__atomic_load_n (&team->task_queued_count, MEMMODEL_ACQUIRE) == 0)
+	return NULL;
+
+      /* Uniform probability work stealing.
+
+	 rand () is not the best PRNG in the world, but it is simple enough
+	 for our purpose.  */
+
+      qid = rand () % team->nthreads;
+      task = gomp_dequeue_task_from_queue (qid, team);
     }
-  else
-    gomp_clear_parent_in_list (&q->l);
+
+  if (__atomic_sub_fetch (&team->task_queued_count, 1, MEMMODEL_ACQUIRE) == 0)
+    {
+      /* Deadlock is not possible since the queue lock is not held in a
+	 barrier_lock region.  */
+      gomp_mutex_lock (&team->barrier_lock);
+      gomp_team_barrier_clear_task_pending (&team->barrier);
+      gomp_mutex_unlock (&team->barrier_lock);
+      return task;
+    }
+
+  return task;
 }
 
+
 /* Helper function for GOMP_task and gomp_create_target_task.
 
    For a TASK with in/out dependencies, fill in the various dependency
@@ -225,6 +285,9 @@ gomp_task_handle_depend (struct gomp_tas
     }
   task->depend_count = ndepend;
   task->num_dependees = 0;
+
+  gomp_mutex_lock (&parent->depend_lock);
+
   if (parent->depend_hash == NULL)
     parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
   for (i = 0; i < ndepend; i++)
@@ -324,6 +387,7 @@ gomp_task_handle_depend (struct gomp_tas
 	  out->redundant_out = true;
 	}
     }
+  gomp_mutex_unlock (&parent->depend_lock);
 }
 
 /* Called when encountering an explicit task directive.  If IF_CLAUSE is
@@ -369,11 +433,13 @@ GOMP_task (void (*fn) (void *), void *da
 	return;
       if (thr->task->taskgroup)
 	{
-	  if (thr->task->taskgroup->cancelled)
+	  if (__atomic_load_n (&thr->task->taskgroup->cancelled,
+			       MEMMODEL_ACQUIRE))
 	    return;
 	  if (thr->task->taskgroup->workshare
 	      && thr->task->taskgroup->prev
-	      && thr->task->taskgroup->prev->cancelled)
+	      && __atomic_load_n (&thr->task->taskgroup->prev->cancelled,
+				  MEMMODEL_ACQUIRE))
 	    return;
 	}
     }
@@ -383,9 +449,11 @@ GOMP_task (void (*fn) (void *), void *da
   else if (priority > gomp_max_task_priority_var)
     priority = gomp_max_task_priority_var;
 
-  if (!if_clause || team == NULL
+  if (!if_clause
+      || team == NULL
       || (thr->task && thr->task->final_task)
-      || team->task_count > 64 * team->nthreads)
+      || __atomic_load_n (&team->task_count, MEMMODEL_RELAXED)
+	 > 64 * team->nthreads)
     {
       struct gomp_task task;
 
@@ -396,7 +464,8 @@ GOMP_task (void (*fn) (void *), void *da
 	 the parent task is suspended until the child task finishes and thus
 	 it can't start further child tasks.  */
       if ((flags & GOMP_TASK_FLAG_DEPEND)
-	  && thr->task && thr->task->depend_hash)
+	  && thr->task
+	  && thr->task->depend_hash)
 	gomp_task_maybe_wait_for_dependencies (depend);
 
       gomp_init_task (&task, thr->task, gomp_icv (false));
@@ -420,22 +489,7 @@ GOMP_task (void (*fn) (void *), void *da
 	}
       else
 	fn (data);
-      /* Access to "children" is normally done inside a task_lock
-	 mutex region, but the only way this particular task.children
-	 can be set is if this thread's task work function (fn)
-	 creates children.  So since the setter is *this* thread, we
-	 need no barriers here when testing for non-NULL.  We can have
-	 task.children set by the current thread then changed by a
-	 child thread, but seeing a stale non-NULL value is not a
-	 problem.  Once past the task_lock acquisition, this thread
-	 will see the real value of task.children.  */
-      if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED))
-	{
-	  gomp_mutex_lock (&team->task_lock);
-	  gomp_clear_parent (&task.children_queue);
-	  gomp_mutex_unlock (&team->task_lock);
-	}
-      gomp_end_task ();
+      gomp_maybe_end_task ();
     }
   else
     {
@@ -443,7 +497,6 @@ GOMP_task (void (*fn) (void *), void *da
       struct gomp_task *parent = thr->task;
       struct gomp_taskgroup *taskgroup = parent->taskgroup;
       char *arg;
-      bool do_wake;
       size_t depend_size = 0;
 
       if (flags & GOMP_TASK_FLAG_DEPEND)
@@ -471,32 +524,32 @@ GOMP_task (void (*fn) (void *), void *da
       task->fn = fn;
       task->fn_data = arg;
       task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
-      gomp_mutex_lock (&team->task_lock);
       /* If parallel or taskgroup has been cancelled, don't start new
 	 tasks.  */
-      if (__builtin_expect (gomp_cancel_var, 0)
-	  && !task->copy_ctors_done)
+      if (__builtin_expect (gomp_cancel_var, 0) && !task->copy_ctors_done)
 	{
 	  if (gomp_team_barrier_cancelled (&team->barrier))
 	    {
 	    do_cancel:
-	      gomp_mutex_unlock (&team->task_lock);
 	      gomp_finish_task (task);
 	      free (task);
 	      return;
 	    }
 	  if (taskgroup)
 	    {
-	      if (taskgroup->cancelled)
+	      if (__atomic_load_n (&taskgroup->cancelled, MEMMODEL_ACQUIRE))
 		goto do_cancel;
 	      if (taskgroup->workshare
 		  && taskgroup->prev
-		  && taskgroup->prev->cancelled)
+		  && __atomic_load_n (&taskgroup->prev->cancelled,
+				      MEMMODEL_ACQUIRE))
 		goto do_cancel;
 	    }
 	}
+
       if (taskgroup)
-	taskgroup->num_children++;
+	__atomic_add_fetch (&taskgroup->num_children, 1, MEMMODEL_ACQ_REL);
+
       if (depend_size)
 	{
 	  gomp_task_handle_depend (task, parent, depend);
@@ -509,36 +562,21 @@ GOMP_task (void (*fn) (void *), void *da
 		 dependencies have been satisfied.  After which, they
 		 can be picked up by the various scheduling
 		 points.  */
-	      gomp_mutex_unlock (&team->task_lock);
 	      return;
 	    }
 	}
 
-      priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
-			     task, priority,
-			     PRIORITY_INSERT_BEGIN,
-			     /*adjust_parent_depends_on=*/false,
-			     task->parent_depends_on);
-      if (taskgroup)
-	priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-			       task, priority,
-			       PRIORITY_INSERT_BEGIN,
-			       /*adjust_parent_depends_on=*/false,
-			       task->parent_depends_on);
-
-      priority_queue_insert (PQ_TEAM, &team->task_queue,
-			     task, priority,
-			     PRIORITY_INSERT_END,
-			     /*adjust_parent_depends_on=*/false,
-			     task->parent_depends_on);
+      __atomic_add_fetch (&team->task_count, 1, MEMMODEL_ACQ_REL);
+      __atomic_add_fetch (&parent->num_children, 1, MEMMODEL_ACQ_REL);
+      gomp_enqueue_task (task, team, thr, task->priority);
 
-      ++team->task_count;
-      ++team->task_queued_count;
+      gomp_mutex_lock (&team->barrier_lock);
       gomp_team_barrier_set_task_pending (&team->barrier);
-      do_wake = team->task_running_count + !parent->in_tied_task
-		< team->nthreads;
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
+      gomp_mutex_unlock (&team->barrier_lock);
+
+      if (__atomic_load_n (&team->task_running_count, MEMMODEL_ACQUIRE)
+	  + !__atomic_load_n (&parent->in_tied_task, MEMMODEL_ACQUIRE)
+	  < team->nthreads)
 	gomp_team_barrier_wake (&team->barrier, 1);
     }
 }
@@ -563,87 +601,29 @@ ialias (GOMP_taskgroup_reduction_registe
 #undef UTYPE
 #undef GOMP_taskloop
 
-static void inline
-priority_queue_move_task_first (enum priority_queue_type type,
-				struct priority_queue *head,
-				struct gomp_task *task)
-{
-#if _LIBGOMP_CHECKING_
-  if (!priority_queue_task_in_queue_p (type, head, task))
-    gomp_fatal ("Attempt to move first missing task %p", task);
-#endif
-  struct priority_list *list;
-  if (priority_queue_multi_p (head))
-    {
-      list = priority_queue_lookup_priority (head, task->priority);
-#if _LIBGOMP_CHECKING_
-      if (!list)
-	gomp_fatal ("Unable to find priority %d", task->priority);
-#endif
-    }
-  else
-    list = &head->l;
-  priority_list_remove (list, task_to_priority_node (type, task), 0);
-  priority_list_insert (type, list, task, task->priority,
-			PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN,
-			task->parent_depends_on);
-}
-
-/* Actual body of GOMP_PLUGIN_target_task_completion that is executed
-   with team->task_lock held, or is executed in the thread that called
-   gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been
-   run before it acquires team->task_lock.  */
-
-static void
-gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task)
+/* Actual body of GOMP_PLUGIN_target_task_completion that is executed
+   with team->task_lock held, or is executed in the thread that called
+   gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been
+   run before it acquires team->task_lock.  */
+
+static void
+gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task,
+			     struct gomp_thread *thread)
 {
-  struct gomp_task *parent = task->parent;
-  if (parent)
-    priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue,
-				    task);
-
-  struct gomp_taskgroup *taskgroup = task->taskgroup;
-  if (taskgroup)
-    priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-				    task);
-
-  priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
-			 PRIORITY_INSERT_BEGIN, false,
-			 task->parent_depends_on);
-  task->kind = GOMP_TASK_WAITING;
-  if (parent && parent->taskwait)
-    {
-      if (parent->taskwait->in_taskwait)
-	{
-	  /* One more task has had its dependencies met.
-	     Inform any waiters.  */
-	  parent->taskwait->in_taskwait = false;
-	  gomp_sem_post (&parent->taskwait->taskwait_sem);
-	}
-      else if (parent->taskwait->in_depend_wait)
-	{
-	  /* One more task has had its dependencies met.
-	     Inform any waiters.  */
-	  parent->taskwait->in_depend_wait = false;
-	  gomp_sem_post (&parent->taskwait->taskwait_sem);
-	}
-    }
-  if (taskgroup && taskgroup->in_taskgroup_wait)
-    {
-      /* One more task has had its dependencies met.
-	 Inform any waiters.  */
-      taskgroup->in_taskgroup_wait = false;
-      gomp_sem_post (&taskgroup->taskgroup_sem);
-    }
+  __atomic_store_n (&task->kind, GOMP_TASK_WAITING, MEMMODEL_RELEASE);
+  gomp_enqueue_task (task, team, thread, task->priority);
 
-  ++team->task_queued_count;
+  gomp_mutex_lock (&team->barrier_lock);
   gomp_team_barrier_set_task_pending (&team->barrier);
-  /* I'm afraid this can't be done after releasing team->task_lock,
+
+  /* I'm afraid this can't be done after releasing team->barrier_lock,
      as gomp_target_task_completion is run from unrelated thread and
      therefore in between gomp_mutex_unlock and gomp_team_barrier_wake
      the team could be gone already.  */
-  if (team->nthreads > team->task_running_count)
+  if (team->nthreads
+      > __atomic_load_n (&team->task_running_count, MEMMODEL_ACQUIRE))
     gomp_team_barrier_wake (&team->barrier, 1);
+  gomp_mutex_unlock (&team->barrier_lock);
 }
 
 /* Signal that a target task TTASK has completed the asynchronously
@@ -657,16 +637,10 @@ GOMP_PLUGIN_target_task_completion (void
   struct gomp_task *task = ttask->task;
   struct gomp_team *team = ttask->team;
 
-  gomp_mutex_lock (&team->task_lock);
-  if (ttask->state == GOMP_TARGET_TASK_READY_TO_RUN)
-    {
-      ttask->state = GOMP_TARGET_TASK_FINISHED;
-      gomp_mutex_unlock (&team->task_lock);
-      return;
-    }
-  ttask->state = GOMP_TARGET_TASK_FINISHED;
-  gomp_target_task_completion (team, task);
-  gomp_mutex_unlock (&team->task_lock);
+  if (__atomic_exchange_n (&ttask->state, GOMP_TARGET_TASK_FINISHED,
+			   MEMMODEL_ACQ_REL) == GOMP_TARGET_TASK_READY_TO_RUN)
+    return;
+  gomp_target_task_completion (team, task, gomp_thread ());
 }
 
 static void gomp_task_run_post_handle_depend_hash (struct gomp_task *);
@@ -674,10 +648,10 @@ static void gomp_task_run_post_handle_de
 /* Called for nowait target tasks.  */
 
 bool
-gomp_create_target_task (struct gomp_device_descr *devicep,
-			 void (*fn) (void *), size_t mapnum, void **hostaddrs,
-			 size_t *sizes, unsigned short *kinds,
-			 unsigned int flags, void **depend, void **args,
+gomp_create_target_task (struct gomp_device_descr *devicep, void (*fn) (void *),
+			 size_t mapnum, void **hostaddrs, size_t *sizes,
+			 unsigned short *kinds, unsigned int flags,
+			 void **depend, void **args,
 			 enum gomp_target_task_state state)
 {
   struct gomp_thread *thr = gomp_thread ();
@@ -690,11 +664,13 @@ gomp_create_target_task (struct gomp_dev
 	return true;
       if (thr->task->taskgroup)
 	{
-	  if (thr->task->taskgroup->cancelled)
+	  if (__atomic_load_n (&thr->task->taskgroup->cancelled,
+			       MEMMODEL_ACQUIRE))
 	    return true;
 	  if (thr->task->taskgroup->workshare
 	      && thr->task->taskgroup->prev
-	      && thr->task->taskgroup->prev->cancelled)
+	      && __atomic_load_n (&thr->task->taskgroup->prev->cancelled,
+				  MEMMODEL_ACQUIRE))
 	    return true;
 	}
     }
@@ -778,331 +754,116 @@ gomp_create_target_task (struct gomp_dev
   task->fn = NULL;
   task->fn_data = ttask;
   task->final_task = 0;
-  gomp_mutex_lock (&team->task_lock);
   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
   if (__builtin_expect (gomp_cancel_var, 0))
     {
       if (gomp_team_barrier_cancelled (&team->barrier))
 	{
 	do_cancel:
-	  gomp_mutex_unlock (&team->task_lock);
 	  gomp_finish_task (task);
 	  free (task);
 	  return true;
 	}
       if (taskgroup)
 	{
-	  if (taskgroup->cancelled)
+	  if (__atomic_load_n (&taskgroup->cancelled, MEMMODEL_ACQUIRE))
 	    goto do_cancel;
 	  if (taskgroup->workshare
 	      && taskgroup->prev
-	      && taskgroup->prev->cancelled)
+	      && __atomic_load_n (&taskgroup->prev->cancelled,
+				  MEMMODEL_ACQUIRE))
 	    goto do_cancel;
 	}
     }
+
   if (depend_size)
     {
       gomp_task_handle_depend (task, parent, depend);
       if (task->num_dependees)
 	{
 	  if (taskgroup)
-	    taskgroup->num_children++;
-	  gomp_mutex_unlock (&team->task_lock);
+	    __atomic_add_fetch (&taskgroup->num_children, 1,
+				MEMMODEL_ACQ_REL);
 	  return true;
 	}
     }
   if (state == GOMP_TARGET_TASK_DATA)
     {
       gomp_task_run_post_handle_depend_hash (task);
-      gomp_mutex_unlock (&team->task_lock);
       gomp_finish_task (task);
       free (task);
       return false;
     }
+
   if (taskgroup)
-    taskgroup->num_children++;
+    __atomic_add_fetch (&taskgroup->num_children, 1, MEMMODEL_ACQ_REL);
+
   /* For async offloading, if we don't need to wait for dependencies,
      run the gomp_target_task_fn right away, essentially schedule the
      mapping part of the task in the current thread.  */
   if (devicep != NULL
       && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     {
-      priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
-			     PRIORITY_INSERT_END,
-			     /*adjust_parent_depends_on=*/false,
-			     task->parent_depends_on);
-      if (taskgroup)
-	priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-			       task, 0, PRIORITY_INSERT_END,
-			       /*adjust_parent_depends_on=*/false,
-			       task->parent_depends_on);
-      task->pnode[PQ_TEAM].next = NULL;
-      task->pnode[PQ_TEAM].prev = NULL;
-      task->kind = GOMP_TASK_TIED;
-      ++team->task_count;
-      gomp_mutex_unlock (&team->task_lock);
+      __atomic_store_n (&task->kind, GOMP_TASK_TIED, MEMMODEL_RELEASE);
+      __atomic_add_fetch (&team->task_count, 1, MEMMODEL_ACQ_REL);
+      __atomic_add_fetch (&parent->num_children, 1, MEMMODEL_ACQ_REL);
 
       thr->task = task;
       gomp_target_task_fn (task->fn_data);
       thr->task = parent;
 
-      gomp_mutex_lock (&team->task_lock);
-      task->kind = GOMP_TASK_ASYNC_RUNNING;
-      /* If GOMP_PLUGIN_target_task_completion has run already
-	 in between gomp_target_task_fn and the mutex lock,
-	 perform the requeuing here.  */
-      if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-	gomp_target_task_completion (team, task);
-      else
-	ttask->state = GOMP_TARGET_TASK_RUNNING;
-      gomp_mutex_unlock (&team->task_lock);
-      return true;
-    }
-  priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
-			 PRIORITY_INSERT_BEGIN,
-			 /*adjust_parent_depends_on=*/false,
-			 task->parent_depends_on);
-  if (taskgroup)
-    priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0,
-			   PRIORITY_INSERT_BEGIN,
-			   /*adjust_parent_depends_on=*/false,
-			   task->parent_depends_on);
-  priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0,
-			 PRIORITY_INSERT_END,
-			 /*adjust_parent_depends_on=*/false,
-			 task->parent_depends_on);
-  ++team->task_count;
-  ++team->task_queued_count;
-  gomp_team_barrier_set_task_pending (&team->barrier);
-  do_wake = team->task_running_count + !parent->in_tied_task
-	    < team->nthreads;
-  gomp_mutex_unlock (&team->task_lock);
-  if (do_wake)
-    gomp_team_barrier_wake (&team->barrier, 1);
-  return true;
-}
-
-/* Given a parent_depends_on task in LIST, move it to the front of its
-   priority so it is run as soon as possible.
-
-   Care is taken to update the list's LAST_PARENT_DEPENDS_ON field.
-
-   We rearrange the queue such that all parent_depends_on tasks are
-   first, and last_parent_depends_on points to the last such task we
-   rearranged.  For example, given the following tasks in a queue
-   where PD[123] are the parent_depends_on tasks:
-
-	task->children
-	|
-	V
-	C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4
-
-	We rearrange such that:
-
-	task->children
-	|	       +--- last_parent_depends_on
-	|	       |
-	V	       V
-	PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4.  */
-
-static void inline
-priority_list_upgrade_task (struct priority_list *list,
-			    struct priority_node *node)
-{
-  struct priority_node *last_parent_depends_on
-    = list->last_parent_depends_on;
-  if (last_parent_depends_on)
-    {
-      node->prev->next = node->next;
-      node->next->prev = node->prev;
-      node->prev = last_parent_depends_on;
-      node->next = last_parent_depends_on->next;
-      node->prev->next = node;
-      node->next->prev = node;
-    }
-  else if (node != list->tasks)
-    {
-      node->prev->next = node->next;
-      node->next->prev = node->prev;
-      node->prev = list->tasks->prev;
-      node->next = list->tasks;
-      list->tasks = node;
-      node->prev->next = node;
-      node->next->prev = node;
-    }
-  list->last_parent_depends_on = node;
-}
-
-/* Given a parent_depends_on TASK in its parent's children_queue, move
-   it to the front of its priority so it is run as soon as possible.
-
-   PARENT is passed as an optimization.
-
-   (This function could be defined in priority_queue.c, but we want it
-   inlined, and putting it in priority_queue.h is not an option, given
-   that gomp_task has not been properly defined at that point).  */
-
-static void inline
-priority_queue_upgrade_task (struct gomp_task *task,
-			     struct gomp_task *parent)
-{
-  struct priority_queue *head = &parent->children_queue;
-  struct priority_node *node = &task->pnode[PQ_CHILDREN];
-#if _LIBGOMP_CHECKING_
-  if (!task->parent_depends_on)
-    gomp_fatal ("priority_queue_upgrade_task: task must be a "
-		"parent_depends_on task");
-  if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task))
-    gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task);
-#endif
-  if (priority_queue_multi_p (head))
-    {
-      struct priority_list *list
-	= priority_queue_lookup_priority (head, task->priority);
-      priority_list_upgrade_task (list, node);
-    }
-  else
-    priority_list_upgrade_task (&head->l, node);
-}
+      __atomic_store_n (&task->kind, GOMP_TASK_ASYNC_RUNNING,
+			MEMMODEL_RELEASE);
 
-/* Given a CHILD_TASK in LIST that is about to be executed, move it out of
-   the way in LIST so that other tasks can be considered for
-   execution.  LIST contains tasks of type TYPE.
-
-   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
-   if applicable.  */
-
-static void inline
-priority_list_downgrade_task (enum priority_queue_type type,
-			      struct priority_list *list,
-			      struct gomp_task *child_task)
-{
-  struct priority_node *node = task_to_priority_node (type, child_task);
-  if (list->tasks == node)
-    list->tasks = node->next;
-  else if (node->next != list->tasks)
-    {
-      /* The task in NODE is about to become TIED and TIED tasks
-	 cannot come before WAITING tasks.  If we're about to
-	 leave the queue in such an indeterminate state, rewire
-	 things appropriately.  However, a TIED task at the end is
-	 perfectly fine.  */
-      struct gomp_task *next_task = priority_node_to_task (type, node->next);
-      if (next_task->kind == GOMP_TASK_WAITING)
-	{
-	  /* Remove from list.  */
-	  node->prev->next = node->next;
-	  node->next->prev = node->prev;
-	  /* Rewire at the end.  */
-	  node->next = list->tasks;
-	  node->prev = list->tasks->prev;
-	  list->tasks->prev->next = node;
-	  list->tasks->prev = node;
+      /* If GOMP_PLUGIN_target_task_completion has run already, perform the
+	 requeuing here.  */
+      if (__atomic_load_n (&ttask->state, MEMMODEL_ACQUIRE)
+	  == GOMP_TARGET_TASK_FINISHED)
+	{
+	  gomp_target_task_completion (team, task, thr);
 	}
-    }
-
-  /* If the current task is the last_parent_depends_on for its
-     priority, adjust last_parent_depends_on appropriately.  */
-  if (__builtin_expect (child_task->parent_depends_on, 0)
-      && list->last_parent_depends_on == node)
-    {
-      struct gomp_task *prev_child = priority_node_to_task (type, node->prev);
-      if (node->prev != node
-	  && prev_child->kind == GOMP_TASK_WAITING
-	  && prev_child->parent_depends_on)
-	list->last_parent_depends_on = node->prev;
-      else
+      else if (__atomic_exchange_n (&ttask->state, GOMP_TARGET_TASK_RUNNING,
+				    MEMMODEL_ACQ_REL)
+	       == GOMP_TARGET_TASK_FINISHED)
 	{
-	  /* There are no more parent_depends_on entries waiting
-	     to run, clear the list.  */
-	  list->last_parent_depends_on = NULL;
+	  gomp_target_task_completion (team, task, thr);
 	}
+      return true;
     }
-}
-
-/* Given a TASK in HEAD that is about to be executed, move it out of
-   the way so that other tasks can be considered for execution.  HEAD
-   contains tasks of type TYPE.
-
-   Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
-   if applicable.
 
-   (This function could be defined in priority_queue.c, but we want it
-   inlined, and putting it in priority_queue.h is not an option, given
-   that gomp_task has not been properly defined at that point).  */
+  __atomic_add_fetch (&team->task_count, 1, MEMMODEL_ACQ_REL);
+  __atomic_add_fetch (&parent->num_children, 1, MEMMODEL_ACQ_REL);
+  gomp_enqueue_task (task, team, thr, task->priority);
 
-static void inline
-priority_queue_downgrade_task (enum priority_queue_type type,
-			       struct priority_queue *head,
-			       struct gomp_task *task)
-{
-#if _LIBGOMP_CHECKING_
-  if (!priority_queue_task_in_queue_p (type, head, task))
-    gomp_fatal ("Attempt to downgrade missing task %p", task);
-#endif
-  if (priority_queue_multi_p (head))
-    {
-      struct priority_list *list
-	= priority_queue_lookup_priority (head, task->priority);
-      priority_list_downgrade_task (type, list, task);
-    }
-  else
-    priority_list_downgrade_task (type, &head->l, task);
+  gomp_mutex_lock (&team->barrier_lock);
+  gomp_team_barrier_set_task_pending (&team->barrier);
+  gomp_mutex_unlock (&team->barrier_lock);
+  do_wake = __atomic_load_n (&team->task_running_count, MEMMODEL_ACQUIRE)
+	      + !__atomic_load_n (&parent->in_tied_task, MEMMODEL_ACQUIRE)
+	    < team->nthreads;
+  if (do_wake)
+    gomp_team_barrier_wake (&team->barrier, 1);
+  return true;
 }
 
-/* Setup CHILD_TASK to execute.  This is done by setting the task to
-   TIED, and updating all relevant queues so that CHILD_TASK is no
-   longer chosen for scheduling.  Also, remove CHILD_TASK from the
-   overall team task queue entirely.
-
-   Return TRUE if task or its containing taskgroup has been
-   cancelled.  */
-
 static inline bool
-gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
-		   struct gomp_team *team)
+gomp_task_run_pre (struct gomp_task *task, struct gomp_team *team)
 {
-#if _LIBGOMP_CHECKING_
-  if (child_task->parent)
-    priority_queue_verify (PQ_CHILDREN,
-			   &child_task->parent->children_queue, true);
-  if (child_task->taskgroup)
-    priority_queue_verify (PQ_TASKGROUP,
-			   &child_task->taskgroup->taskgroup_queue, false);
-  priority_queue_verify (PQ_TEAM, &team->task_queue, false);
-#endif
-
-  /* Task is about to go tied, move it out of the way.  */
-  if (parent)
-    priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue,
-				   child_task);
-
-  /* Task is about to go tied, move it out of the way.  */
-  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
-  if (taskgroup)
-    priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-				   child_task);
+  struct gomp_taskgroup *taskgroup = task->taskgroup;
 
-  priority_queue_remove (PQ_TEAM, &team->task_queue, child_task,
-			 MEMMODEL_RELAXED);
-  child_task->pnode[PQ_TEAM].next = NULL;
-  child_task->pnode[PQ_TEAM].prev = NULL;
-  child_task->kind = GOMP_TASK_TIED;
-
-  if (--team->task_queued_count == 0)
-    gomp_team_barrier_clear_task_pending (&team->barrier);
-  if (__builtin_expect (gomp_cancel_var, 0)
-      && !child_task->copy_ctors_done)
+  __atomic_store_n (&task->kind, GOMP_TASK_TIED, MEMMODEL_RELEASE);
+  if (__builtin_expect (gomp_cancel_var, 0) && !task->copy_ctors_done)
     {
       if (gomp_team_barrier_cancelled (&team->barrier))
 	return true;
       if (taskgroup)
 	{
-	  if (taskgroup->cancelled)
+	  if (__atomic_load_n (&taskgroup->cancelled, MEMMODEL_ACQUIRE))
 	    return true;
 	  if (taskgroup->workshare
 	      && taskgroup->prev
-	      && taskgroup->prev->cancelled)
+	      && __atomic_load_n (&taskgroup->prev->cancelled,
+				  MEMMODEL_ACQUIRE))
 	    return true;
 	}
     }
@@ -1116,25 +877,27 @@ gomp_task_run_post_handle_depend_hash (s
   size_t i;
 
   for (i = 0; i < child_task->depend_count; i++)
-    if (!child_task->depend[i].redundant)
-      {
-	if (child_task->depend[i].next)
-	  child_task->depend[i].next->prev = child_task->depend[i].prev;
-	if (child_task->depend[i].prev)
-	  child_task->depend[i].prev->next = child_task->depend[i].next;
-	else
-	  {
-	    hash_entry_type *slot
-	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
-				NO_INSERT);
-	    if (*slot != &child_task->depend[i])
-	      abort ();
-	    if (child_task->depend[i].next)
-	      *slot = child_task->depend[i].next;
-	    else
-	      htab_clear_slot (parent->depend_hash, slot);
-	  }
-      }
+    {
+      if (!child_task->depend[i].redundant)
+	{
+	  if (child_task->depend[i].next)
+	    child_task->depend[i].next->prev = child_task->depend[i].prev;
+	  if (child_task->depend[i].prev)
+	    child_task->depend[i].prev->next = child_task->depend[i].next;
+	  else
+	    {
+	      hash_entry_type *slot
+		= htab_find_slot (&parent->depend_hash, &child_task->depend[i],
+				  NO_INSERT);
+	      if (*slot != &child_task->depend[i])
+		abort ();
+	      if (child_task->depend[i].next)
+		*slot = child_task->depend[i].next;
+	      else
+		htab_clear_slot (parent->depend_hash, slot);
+	    }
+	}
+    }
 }
 
 /* After a CHILD_TASK has been run, adjust the dependency queue for
@@ -1149,6 +912,7 @@ static size_t
 gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
 				     struct gomp_team *team)
 {
+  struct gomp_thread *thr = gomp_thread ();
   struct gomp_task *parent = child_task->parent;
   size_t i, count = child_task->dependers->n_elem, ret = 0;
   for (i = 0; i < count; i++)
@@ -1159,57 +923,14 @@ gomp_task_run_post_handle_dependers (str
 	 TASK's remaining dependencies.  Once TASK has no other
 	 dependencies, put it into the various queues so it will get
 	 scheduled for execution.  */
-      if (--task->num_dependees != 0)
+      if (__atomic_sub_fetch (&task->num_dependees, 1, MEMMODEL_ACQ_REL) != 0)
 	continue;
 
-      struct gomp_taskgroup *taskgroup = task->taskgroup;
+      __atomic_add_fetch (&team->task_count, 1, MEMMODEL_ACQ_REL);
       if (parent)
-	{
-	  priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
-				 task, task->priority,
-				 PRIORITY_INSERT_BEGIN,
-				 /*adjust_parent_depends_on=*/true,
-				 task->parent_depends_on);
-	  if (parent->taskwait)
-	    {
-	      if (parent->taskwait->in_taskwait)
-		{
-		  /* One more task has had its dependencies met.
-		     Inform any waiters.  */
-		  parent->taskwait->in_taskwait = false;
-		  gomp_sem_post (&parent->taskwait->taskwait_sem);
-		}
-	      else if (parent->taskwait->in_depend_wait)
-		{
-		  /* One more task has had its dependencies met.
-		     Inform any waiters.  */
-		  parent->taskwait->in_depend_wait = false;
-		  gomp_sem_post (&parent->taskwait->taskwait_sem);
-		}
-	    }
-	}
-      if (taskgroup)
-	{
-	  priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-				 task, task->priority,
-				 PRIORITY_INSERT_BEGIN,
-				 /*adjust_parent_depends_on=*/false,
-				 task->parent_depends_on);
-	  if (taskgroup->in_taskgroup_wait)
-	    {
-	      /* One more task has had its dependencies met.
-		 Inform any waiters.  */
-	      taskgroup->in_taskgroup_wait = false;
-	      gomp_sem_post (&taskgroup->taskgroup_sem);
-	    }
-	}
-      priority_queue_insert (PQ_TEAM, &team->task_queue,
-			     task, task->priority,
-			     PRIORITY_INSERT_END,
-			     /*adjust_parent_depends_on=*/false,
-			     task->parent_depends_on);
-      ++team->task_count;
-      ++team->task_queued_count;
+	  __atomic_add_fetch (&parent->num_children, 1, MEMMODEL_ACQ_REL);
+
+      gomp_enqueue_task (task, team, thr, task->priority);
       ++ret;
     }
   free (child_task->dependers);
@@ -1223,80 +944,166 @@ static inline size_t
 gomp_task_run_post_handle_depend (struct gomp_task *child_task,
 				  struct gomp_team *team)
 {
-  if (child_task->depend_count == 0)
-    return 0;
-
-  /* If parent is gone already, the hash table is freed and nothing
-     will use the hash table anymore, no need to remove anything from it.  */
-  if (child_task->parent != NULL)
-    gomp_task_run_post_handle_depend_hash (child_task);
+  size_t new_tasks = 0;
+  struct gomp_task *parent = child_task->parent;
 
-  if (child_task->dependers == NULL)
+  if (child_task->depend_count == 0)
     return 0;
 
-  return gomp_task_run_post_handle_dependers (child_task, team);
+  gomp_mutex_lock (&parent->depend_lock);
+  gomp_task_run_post_handle_depend_hash (child_task);
+  if (child_task->dependers != NULL)
+    new_tasks = gomp_task_run_post_handle_dependers (child_task, team);
+  gomp_mutex_unlock (&parent->depend_lock);
+  return new_tasks;
 }
 
 /* Remove CHILD_TASK from its parent.  */
 
 static inline void
-gomp_task_run_post_remove_parent (struct gomp_task *child_task)
+gomp_task_run_post_notify_parent (struct gomp_task *child_task)
 {
   struct gomp_task *parent = child_task->parent;
   if (parent == NULL)
     return;
 
-  /* If this was the last task the parent was depending on,
-     synchronize with gomp_task_maybe_wait_for_dependencies so it can
-     clean up and return.  */
-  if (__builtin_expect (child_task->parent_depends_on, 0)
-      && --parent->taskwait->n_depend == 0
-      && parent->taskwait->in_depend_wait)
-    {
-      parent->taskwait->in_depend_wait = false;
-      gomp_sem_post (&parent->taskwait->taskwait_sem);
-    }
+  if (__builtin_expect (child_task->parent_depends_on, 0))
+    __atomic_sub_fetch (&parent->num_awaited, 1, MEMMODEL_ACQ_REL);
 
-  if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue,
-			     child_task, MEMMODEL_RELEASE)
-      && parent->taskwait && parent->taskwait->in_taskwait)
-    {
-      parent->taskwait->in_taskwait = false;
-      gomp_sem_post (&parent->taskwait->taskwait_sem);
+  /* The moment we decrement num_children we enter a critical section with
+     the thread executing the parent, so mark the task state accordingly.
+     If the parent was already done executing, task->state will be
+     GOMP_STATE_DONE.  If the parent finished executing while in the critical
+     section, task->state will be GOMP_STATE_DONE when we exchange before
+     exiting the critical section.  In that case the responsibility of freeing
+     the task has been deferred to the current child task.  */
+
+  if (__atomic_load_n (&parent->num_children, MEMMODEL_ACQUIRE) == 1)
+    {
+      enum gomp_task_state state
+	= __atomic_exchange_n (&parent->state, GOMP_STATE_CRITICAL,
+			       MEMMODEL_ACQ_REL);
+      __atomic_sub_fetch (&parent->num_children, 1, MEMMODEL_ACQ_REL);
+      /* Enter critical section.  */
+      if (state == GOMP_STATE_DONE)
+	{
+	  /* Guaranteed: task->state was already GOMP_STATE_DONE before
+	     decrementing.  */
+	free_task:;
+	  gomp_finish_task (parent);
+	  free (parent);
+	}
+      else if (__atomic_exchange_n (&parent->state, GOMP_STATE_NORMAL,
+				    MEMMODEL_ACQ_REL)
+	       == GOMP_STATE_DONE)
+	/* Guaranteed: task->state was GOMP_STATE_DONE after we entered and
+	   before we left the critical section.  */
+	goto free_task;
+      /* Exit critical section.  */
     }
-  child_task->pnode[PQ_CHILDREN].next = NULL;
-  child_task->pnode[PQ_CHILDREN].prev = NULL;
+  else
+    __atomic_sub_fetch (&parent->num_children, 1, MEMMODEL_ACQ_REL);
 }
 
-/* Remove CHILD_TASK from its taskgroup.  */
+/* Notify the taskgroup that a child has finished executing.  */
 
 static inline void
-gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
+gomp_task_run_post_notify_taskgroup (struct gomp_task *child_task)
 {
   struct gomp_taskgroup *taskgroup = child_task->taskgroup;
   if (taskgroup == NULL)
     return;
-  bool empty = priority_queue_remove (PQ_TASKGROUP,
-				      &taskgroup->taskgroup_queue,
-				      child_task, MEMMODEL_RELAXED);
-  child_task->pnode[PQ_TASKGROUP].next = NULL;
-  child_task->pnode[PQ_TASKGROUP].prev = NULL;
-  if (taskgroup->num_children > 1)
-    --taskgroup->num_children;
-  else
+  __atomic_sub_fetch (&taskgroup->num_children, 1, MEMMODEL_ACQ_REL);
+}
+
+/* Execute one task dequeued from the team queue, if there is one.
+
+   The caller checks for a stop criterion and calls again if the criterion
+   hasn't been met.  false is returned if the routine was exited because
+   the team queue was empty.  TEAM and THR are passed as an optimization.  */
+
+static inline bool
+gomp_execute_task (struct gomp_team *team, struct gomp_thread *thr,
+		   struct gomp_task *task)
+{
+  bool cancelled = false;
+  struct gomp_task *next_task = NULL;
+
+  next_task = gomp_dequeue_task (team, thr);
+  if (next_task == NULL)
+    return false;
+
+  cancelled = gomp_task_run_pre (next_task, team);
+  if (__builtin_expect (cancelled, 0))
+    goto finish_cancelled;
+
+  thr->task = next_task;
+  if (__builtin_expect (next_task->fn == NULL, 0))
     {
-      /* We access taskgroup->num_children in GOMP_taskgroup_end
-	 outside of the task lock mutex region, so
-	 need a release barrier here to ensure memory
-	 written by child_task->fn above is flushed
-	 before the NULL is written.  */
-      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+      if (gomp_target_task_fn (next_task->fn_data))
+	{
+	  thr->task = task;
+	  __atomic_store_n (&next_task->kind, GOMP_TASK_ASYNC_RUNNING,
+			    MEMMODEL_RELEASE);
+	  struct gomp_target_task *ttask
+	    = (struct gomp_target_task *) next_task->fn_data;
+	  /* If GOMP_PLUGIN_target_task_completion has run already,
+	     perform the requeuing here.  */
+
+	  if (__atomic_load_n (&ttask->state, MEMMODEL_ACQUIRE)
+	      == GOMP_TARGET_TASK_FINISHED)
+	    {
+	      gomp_target_task_completion (team, next_task, thr);
+	    }
+	  else if (__atomic_exchange_n (&ttask->state, GOMP_TARGET_TASK_RUNNING,
+					MEMMODEL_ACQ_REL)
+		   == GOMP_TARGET_TASK_FINISHED)
+	    {
+	      gomp_target_task_completion (team, next_task, thr);
+	    }
+	  return true;
+	}
+    }
+  else
+    next_task->fn (next_task->fn_data);
+  thr->task = task;
+
+finish_cancelled:;
+  size_t new_tasks = gomp_task_run_post_handle_depend (next_task, team);
+
+  gomp_task_run_post_notify_parent (next_task);
+  gomp_task_run_post_notify_taskgroup (next_task);
+
+  __atomic_sub_fetch (&team->task_count, 1, MEMMODEL_ACQ_REL);
+  if (new_tasks > 1)
+    {
+      int do_wake
+	= team->nthreads
+	  - __atomic_load_n (&team->task_running_count, MEMMODEL_ACQUIRE)
+	  - !__atomic_load_n (&task->in_tied_task, MEMMODEL_ACQUIRE);
+      if (do_wake > new_tasks)
+	do_wake = new_tasks;
+
+      if (do_wake)
+	gomp_team_barrier_wake (&team->barrier, do_wake);
     }
-  if (empty && taskgroup->in_taskgroup_wait)
+
+  /* If the task doesn't have any children and nobody is in the critical
+     section, task->state is GOMP_STATE_NORMAL and we can free it right away.
+     If the executing thread of a child task has entered the critical section,
+     task->state will be GOMP_STATE_CRITICAL.  In that case we store
+     GOMP_STATE_DONE and defer the freeing to the child.  */
+
+  if (__atomic_load_n (&next_task->num_children, MEMMODEL_ACQUIRE) == 0
+      && __atomic_exchange_n (&next_task->state, GOMP_STATE_DONE,
+			      MEMMODEL_ACQ_REL)
+	   == GOMP_STATE_NORMAL)
     {
-      taskgroup->in_taskgroup_wait = false;
-      gomp_sem_post (&taskgroup->taskgroup_sem);
+      gomp_finish_task (next_task);
+      free (next_task);
     }
+
+  return true;
 }
 
 void
@@ -1305,117 +1112,119 @@ gomp_barrier_handle_tasks (gomp_barrier_
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
   struct gomp_task *task = thr->task;
-  struct gomp_task *child_task = NULL;
-  struct gomp_task *to_free = NULL;
-  int do_wake = 0;
+  struct gomp_task *next_task = NULL;
 
-  gomp_mutex_lock (&team->task_lock);
+  gomp_mutex_lock (&team->barrier_lock);
   if (gomp_barrier_last_thread (state))
     {
-      if (team->task_count == 0)
+      if (__atomic_load_n (&team->task_count, MEMMODEL_ACQUIRE) == 0)
 	{
 	  gomp_team_barrier_done (&team->barrier, state);
-	  gomp_mutex_unlock (&team->task_lock);
+	  gomp_mutex_unlock (&team->barrier_lock);
 	  gomp_team_barrier_wake (&team->barrier, 0);
 	  return;
 	}
       gomp_team_barrier_set_waiting_for_tasks (&team->barrier);
     }
+  gomp_mutex_unlock (&team->barrier_lock);
 
-  while (1)
+  while (true)
     {
       bool cancelled = false;
-      if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED))
-	{
-	  bool ignored;
-	  child_task
-	    = priority_queue_next_task (PQ_TEAM, &team->task_queue,
-					PQ_IGNORED, NULL,
-					&ignored);
-	  cancelled = gomp_task_run_pre (child_task, child_task->parent,
-					 team);
-	  if (__builtin_expect (cancelled, 0))
-	    {
-	      if (to_free)
-		{
-		  gomp_finish_task (to_free);
-		  free (to_free);
-		  to_free = NULL;
-		}
-	      goto finish_cancelled;
-	    }
-	  team->task_running_count++;
-	  child_task->in_tied_task = true;
-	}
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
-	{
-	  gomp_team_barrier_wake (&team->barrier, do_wake);
-	  do_wake = 0;
-	}
-      if (to_free)
+      next_task = gomp_dequeue_task (team, thr);
+      if (next_task == NULL)
+	return;
+
+      cancelled = gomp_task_run_pre (next_task, team);
+      if (__builtin_expect (cancelled, 0))
 	{
-	  gomp_finish_task (to_free);
-	  free (to_free);
-	  to_free = NULL;
+	  goto finish_cancelled;
 	}
-      if (child_task)
+
+      __atomic_add_fetch (&team->task_running_count, 1, MEMMODEL_ACQ_REL);
+      __atomic_store_n (&next_task->in_tied_task, true, MEMMODEL_RELEASE);
+
+      thr->task = next_task;
+      if (__builtin_expect (next_task->fn == NULL, 0))
 	{
-	  thr->task = child_task;
-	  if (__builtin_expect (child_task->fn == NULL, 0))
+	  if (gomp_target_task_fn (next_task->fn_data))
 	    {
-	      if (gomp_target_task_fn (child_task->fn_data))
+	      thr->task = task;
+	      __atomic_store_n (&next_task->kind, GOMP_TASK_ASYNC_RUNNING,
+				MEMMODEL_RELEASE);
+	      __atomic_sub_fetch (&team->task_running_count, 1,
+				  MEMMODEL_ACQ_REL);
+	      struct gomp_target_task *ttask
+		= (struct gomp_target_task *) next_task->fn_data;
+	      /* If GOMP_PLUGIN_target_task_completion has run already
+		 in between gomp_target_task_fn and the mutex lock,
+		 perform the requeuing here.  */
+
+	      if (__atomic_load_n (&ttask->state, MEMMODEL_ACQUIRE)
+		  == GOMP_TARGET_TASK_FINISHED)
 		{
-		  thr->task = task;
-		  gomp_mutex_lock (&team->task_lock);
-		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
-		  team->task_running_count--;
-		  struct gomp_target_task *ttask
-		    = (struct gomp_target_task *) child_task->fn_data;
-		  /* If GOMP_PLUGIN_target_task_completion has run already
-		     in between gomp_target_task_fn and the mutex lock,
-		     perform the requeuing here.  */
-		  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-		    gomp_target_task_completion (team, child_task);
-		  else
-		    ttask->state = GOMP_TARGET_TASK_RUNNING;
-		  child_task = NULL;
-		  continue;
+		  gomp_target_task_completion (team, next_task, thr);
+		}
+	      else if (__atomic_exchange_n (&ttask->state,
+					    GOMP_TARGET_TASK_RUNNING,
+					    MEMMODEL_ACQ_REL)
+		       == GOMP_TARGET_TASK_FINISHED)
+		{
+		  gomp_target_task_completion (team, next_task, thr);
 		}
+	      continue;
 	    }
-	  else
-	    child_task->fn (child_task->fn_data);
-	  thr->task = task;
 	}
       else
-	return;
-      gomp_mutex_lock (&team->task_lock);
-      if (child_task)
+	next_task->fn (next_task->fn_data);
+      thr->task = task;
+
+    finish_cancelled:;
+      size_t new_tasks = gomp_task_run_post_handle_depend (next_task, team);
+      gomp_task_run_post_notify_parent (next_task);
+      gomp_task_run_post_notify_taskgroup (next_task);
+
+      if (!cancelled)
+	__atomic_sub_fetch (&team->task_running_count, 1, MEMMODEL_ACQ_REL);
+
+      if (new_tasks > 1)
+	{
+	  int do_wake
+	    = team->nthreads
+	      - __atomic_load_n (&team->task_running_count, MEMMODEL_ACQUIRE);
+	  if (do_wake > new_tasks)
+	    do_wake = new_tasks;
+
+	  if (do_wake)
+	    gomp_team_barrier_wake (&team->barrier, do_wake);
+	}
+
+      gomp_mutex_lock (&team->barrier_lock);
+      bool signal_barrier
+	= __atomic_sub_fetch (&team->task_count, 1, MEMMODEL_ACQ_REL) == 0
+	  && gomp_team_barrier_waiting_for_tasks (&team->barrier);
+      if (signal_barrier)
 	{
-	 finish_cancelled:;
-	  size_t new_tasks
-	    = gomp_task_run_post_handle_depend (child_task, team);
-	  gomp_task_run_post_remove_parent (child_task);
-	  gomp_clear_parent (&child_task->children_queue);
-	  gomp_task_run_post_remove_taskgroup (child_task);
-	  to_free = child_task;
-	  child_task = NULL;
-	  if (!cancelled)
-	    team->task_running_count--;
-	  if (new_tasks > 1)
-	    {
-	      do_wake = team->nthreads - team->task_running_count;
-	      if (do_wake > new_tasks)
-		do_wake = new_tasks;
-	    }
-	  if (--team->task_count == 0
-	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
-	    {
-	      gomp_team_barrier_done (&team->barrier, state);
-	      gomp_mutex_unlock (&team->task_lock);
-	      gomp_team_barrier_wake (&team->barrier, 0);
-	      gomp_mutex_lock (&team->task_lock);
-	    }
+	  gomp_team_barrier_done (&team->barrier, state);
+	  gomp_mutex_unlock (&team->barrier_lock);
+	  gomp_team_barrier_wake (&team->barrier, 0);
+	}
+      else
+	gomp_mutex_unlock (&team->barrier_lock);
+
+      /* If the task doesn't have any children and nobody is in the critical
+	 section, task->state is GOMP_STATE_NORMAL and we can free it right
+	 away.  If the executing thread of a child task has entered the critical
+	 section, task->state will be GOMP_STATE_CRITICAL.  In that case we
+	 store GOMP_STATE_DONE and defer the freeing to the child.  */
+
+      if (__atomic_load_n (&next_task->num_children, MEMMODEL_ACQUIRE) == 0
+	  && __atomic_exchange_n (&next_task->state, GOMP_STATE_DONE,
+				  MEMMODEL_ACQ_REL)
+	       == GOMP_STATE_NORMAL)
+	{
+	  gomp_finish_task (next_task);
+	  free (next_task);
 	}
     }
 }
@@ -1430,10 +1239,6 @@ GOMP_taskwait (void)
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
   struct gomp_task *task = thr->task;
-  struct gomp_task *child_task = NULL;
-  struct gomp_task *to_free = NULL;
-  struct gomp_taskwait taskwait;
-  int do_wake = 0;
 
   /* The acquire barrier on load of task->children here synchronizes
      with the write of a NULL in gomp_task_run_post_remove_parent.  It is
@@ -1442,134 +1247,11 @@ GOMP_taskwait (void)
      child thread task work function are seen before we exit from
      GOMP_taskwait.  */
   if (task == NULL
-      || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE))
+      || __atomic_load_n (&task->num_children, MEMMODEL_ACQUIRE) == 0)
     return;
 
-  memset (&taskwait, 0, sizeof (taskwait));
-  bool child_q = false;
-  gomp_mutex_lock (&team->task_lock);
-  while (1)
-    {
-      bool cancelled = false;
-      if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED))
-	{
-	  bool destroy_taskwait = task->taskwait != NULL;
-	  task->taskwait = NULL;
-	  gomp_mutex_unlock (&team->task_lock);
-	  if (to_free)
-	    {
-	      gomp_finish_task (to_free);
-	      free (to_free);
-	    }
-	  if (destroy_taskwait)
-	    gomp_sem_destroy (&taskwait.taskwait_sem);
-	  return;
-	}
-      struct gomp_task *next_task
-	= priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
-				    PQ_TEAM, &team->task_queue, &child_q);
-      if (next_task->kind == GOMP_TASK_WAITING)
-	{
-	  child_task = next_task;
-	  cancelled
-	    = gomp_task_run_pre (child_task, task, team);
-	  if (__builtin_expect (cancelled, 0))
-	    {
-	      if (to_free)
-		{
-		  gomp_finish_task (to_free);
-		  free (to_free);
-		  to_free = NULL;
-		}
-	      goto finish_cancelled;
-	    }
-	}
-      else
-	{
-	/* All tasks we are waiting for are either running in other
-	   threads, or they are tasks that have not had their
-	   dependencies met (so they're not even in the queue).  Wait
-	   for them.  */
-	  if (task->taskwait == NULL)
-	    {
-	      taskwait.in_depend_wait = false;
-	      gomp_sem_init (&taskwait.taskwait_sem, 0);
-	      task->taskwait = &taskwait;
-	    }
-	  taskwait.in_taskwait = true;
-	}
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
-	{
-	  gomp_team_barrier_wake (&team->barrier, do_wake);
-	  do_wake = 0;
-	}
-      if (to_free)
-	{
-	  gomp_finish_task (to_free);
-	  free (to_free);
-	  to_free = NULL;
-	}
-      if (child_task)
-	{
-	  thr->task = child_task;
-	  if (__builtin_expect (child_task->fn == NULL, 0))
-	    {
-	      if (gomp_target_task_fn (child_task->fn_data))
-		{
-		  thr->task = task;
-		  gomp_mutex_lock (&team->task_lock);
-		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
-		  struct gomp_target_task *ttask
-		    = (struct gomp_target_task *) child_task->fn_data;
-		  /* If GOMP_PLUGIN_target_task_completion has run already
-		     in between gomp_target_task_fn and the mutex lock,
-		     perform the requeuing here.  */
-		  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-		    gomp_target_task_completion (team, child_task);
-		  else
-		    ttask->state = GOMP_TARGET_TASK_RUNNING;
-		  child_task = NULL;
-		  continue;
-		}
-	    }
-	  else
-	    child_task->fn (child_task->fn_data);
-	  thr->task = task;
-	}
-      else
-	gomp_sem_wait (&taskwait.taskwait_sem);
-      gomp_mutex_lock (&team->task_lock);
-      if (child_task)
-	{
-	 finish_cancelled:;
-	  size_t new_tasks
-	    = gomp_task_run_post_handle_depend (child_task, team);
-
-	  if (child_q)
-	    {
-	      priority_queue_remove (PQ_CHILDREN, &task->children_queue,
-				     child_task, MEMMODEL_RELAXED);
-	      child_task->pnode[PQ_CHILDREN].next = NULL;
-	      child_task->pnode[PQ_CHILDREN].prev = NULL;
-	    }
-
-	  gomp_clear_parent (&child_task->children_queue);
-
-	  gomp_task_run_post_remove_taskgroup (child_task);
-
-	  to_free = child_task;
-	  child_task = NULL;
-	  team->task_count--;
-	  if (new_tasks > 1)
-	    {
-	      do_wake = team->nthreads - team->task_running_count
-			- !task->in_tied_task;
-	      if (do_wake > new_tasks)
-		do_wake = new_tasks;
-	    }
-	}
-    }
+  while (__atomic_load_n (&task->num_children, MEMMODEL_RELAXED) != 0)
+    gomp_execute_task (team, thr, task);
 }
 
 /* Called when encountering a taskwait directive with depend clause(s).
@@ -1588,11 +1270,13 @@ GOMP_taskwait_depend (void **depend)
 	return;
       if (thr->task->taskgroup)
 	{
-	  if (thr->task->taskgroup->cancelled)
+	  if (__atomic_load_n (&thr->task->taskgroup->cancelled,
+			       MEMMODEL_ACQUIRE))
 	    return;
 	  if (thr->task->taskgroup->workshare
 	      && thr->task->taskgroup->prev
-	      && thr->task->taskgroup->prev->cancelled)
+	      && __atomic_load_n (&thr->task->taskgroup->prev->cancelled,
+				  MEMMODEL_ACQUIRE))
 	    return;
 	}
     }
@@ -1621,7 +1305,6 @@ gomp_task_maybe_wait_for_dependencies (v
   struct gomp_task *task = thr->task;
   struct gomp_team *team = thr->ts.team;
   struct gomp_task_depend_entry elem, *ent = NULL;
-  struct gomp_taskwait taskwait;
   size_t orig_ndepend = (uintptr_t) depend[0];
   size_t nout = (uintptr_t) depend[1];
   size_t ndepend = orig_ndepend;
@@ -1629,9 +1312,6 @@ gomp_task_maybe_wait_for_dependencies (v
   size_t n = 2;
   size_t i;
   size_t num_awaited = 0;
-  struct gomp_task *child_task = NULL;
-  struct gomp_task *to_free = NULL;
-  int do_wake = 0;
 
   if (ndepend == 0)
     {
@@ -1640,7 +1320,7 @@ gomp_task_maybe_wait_for_dependencies (v
       normal = nout + (uintptr_t) depend[4];
       n = 5;
     }
-  gomp_mutex_lock (&team->task_lock);
+  gomp_mutex_lock (&task->depend_lock);
   for (i = 0; i < ndepend; i++)
     {
       elem.addr = depend[i + n];
@@ -1674,150 +1354,17 @@ gomp_task_maybe_wait_for_dependencies (v
 	      {
 		tsk->parent_depends_on = true;
 		++num_awaited;
-		/* If depenency TSK itself has no dependencies and is
-		   ready to run, move it up front so that we run it as
-		   soon as possible.  */
-		if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
-		  priority_queue_upgrade_task (tsk, task);
 	      }
 	  }
     }
-  if (num_awaited == 0)
-    {
-      gomp_mutex_unlock (&team->task_lock);
-      return;
-    }
-
-  memset (&taskwait, 0, sizeof (taskwait));
-  taskwait.n_depend = num_awaited;
-  gomp_sem_init (&taskwait.taskwait_sem, 0);
-  task->taskwait = &taskwait;
+  gomp_mutex_unlock (&task->depend_lock);
 
-  while (1)
-    {
-      bool cancelled = false;
-      if (taskwait.n_depend == 0)
-	{
-	  task->taskwait = NULL;
-	  gomp_mutex_unlock (&team->task_lock);
-	  if (to_free)
-	    {
-	      gomp_finish_task (to_free);
-	      free (to_free);
-	    }
-	  gomp_sem_destroy (&taskwait.taskwait_sem);
-	  return;
-	}
+  if (__atomic_add_fetch (&task->num_awaited, num_awaited, MEMMODEL_ACQ_REL)
+      == 0)
+    return;
 
-      /* Theoretically when we have multiple priorities, we should
-	 chose between the highest priority item in
-	 task->children_queue and team->task_queue here, so we should
-	 use priority_queue_next_task().  However, since we are
-	 running an undeferred task, perhaps that makes all tasks it
-	 depends on undeferred, thus a priority of INF?  This would
-	 make it unnecessary to take anything into account here,
-	 but the dependencies.
-
-	 On the other hand, if we want to use priority_queue_next_task(),
-	 care should be taken to only use priority_queue_remove()
-	 below if the task was actually removed from the children
-	 queue.  */
-      bool ignored;
-      struct gomp_task *next_task
-	= priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
-				    PQ_IGNORED, NULL, &ignored);
-
-      if (next_task->kind == GOMP_TASK_WAITING)
-	{
-	  child_task = next_task;
-	  cancelled
-	    = gomp_task_run_pre (child_task, task, team);
-	  if (__builtin_expect (cancelled, 0))
-	    {
-	      if (to_free)
-		{
-		  gomp_finish_task (to_free);
-		  free (to_free);
-		  to_free = NULL;
-		}
-	      goto finish_cancelled;
-	    }
-	}
-      else
-	/* All tasks we are waiting for are either running in other
-	   threads, or they are tasks that have not had their
-	   dependencies met (so they're not even in the queue).  Wait
-	   for them.  */
-	taskwait.in_depend_wait = true;
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
-	{
-	  gomp_team_barrier_wake (&team->barrier, do_wake);
-	  do_wake = 0;
-	}
-      if (to_free)
-	{
-	  gomp_finish_task (to_free);
-	  free (to_free);
-	  to_free = NULL;
-	}
-      if (child_task)
-	{
-	  thr->task = child_task;
-	  if (__builtin_expect (child_task->fn == NULL, 0))
-	    {
-	      if (gomp_target_task_fn (child_task->fn_data))
-		{
-		  thr->task = task;
-		  gomp_mutex_lock (&team->task_lock);
-		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
-		  struct gomp_target_task *ttask
-		    = (struct gomp_target_task *) child_task->fn_data;
-		  /* If GOMP_PLUGIN_target_task_completion has run already
-		     in between gomp_target_task_fn and the mutex lock,
-		     perform the requeuing here.  */
-		  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-		    gomp_target_task_completion (team, child_task);
-		  else
-		    ttask->state = GOMP_TARGET_TASK_RUNNING;
-		  child_task = NULL;
-		  continue;
-		}
-	    }
-	  else
-	    child_task->fn (child_task->fn_data);
-	  thr->task = task;
-	}
-      else
-	gomp_sem_wait (&taskwait.taskwait_sem);
-      gomp_mutex_lock (&team->task_lock);
-      if (child_task)
-	{
-	 finish_cancelled:;
-	  size_t new_tasks
-	    = gomp_task_run_post_handle_depend (child_task, team);
-	  if (child_task->parent_depends_on)
-	    --taskwait.n_depend;
-
-	  priority_queue_remove (PQ_CHILDREN, &task->children_queue,
-				 child_task, MEMMODEL_RELAXED);
-	  child_task->pnode[PQ_CHILDREN].next = NULL;
-	  child_task->pnode[PQ_CHILDREN].prev = NULL;
-
-	  gomp_clear_parent (&child_task->children_queue);
-	  gomp_task_run_post_remove_taskgroup (child_task);
-	  to_free = child_task;
-	  child_task = NULL;
-	  team->task_count--;
-	  if (new_tasks > 1)
-	    {
-	      do_wake = team->nthreads - team->task_running_count
-			- !task->in_tied_task;
-	      if (do_wake > new_tasks)
-		do_wake = new_tasks;
-	    }
-	}
-    }
+  while (__atomic_load_n (&task->num_awaited, MEMMODEL_ACQUIRE) != 0)
+    gomp_execute_task (team, thr, task);
 }
 
 /* Called when encountering a taskyield directive.  */
@@ -1834,13 +1381,11 @@ gomp_taskgroup_init (struct gomp_taskgro
   struct gomp_taskgroup *taskgroup
     = gomp_malloc (sizeof (struct gomp_taskgroup));
   taskgroup->prev = prev;
-  priority_queue_init (&taskgroup->taskgroup_queue);
   taskgroup->reductions = prev ? prev->reductions : NULL;
   taskgroup->in_taskgroup_wait = false;
   taskgroup->cancelled = false;
   taskgroup->workshare = false;
   taskgroup->num_children = 0;
-  gomp_sem_init (&taskgroup->taskgroup_sem, 0);
   return taskgroup;
 }
 
@@ -1867,15 +1412,11 @@ GOMP_taskgroup_end (void)
   struct gomp_team *team = thr->ts.team;
   struct gomp_task *task = thr->task;
   struct gomp_taskgroup *taskgroup;
-  struct gomp_task *child_task = NULL;
-  struct gomp_task *to_free = NULL;
-  int do_wake = 0;
 
   if (team == NULL)
     return;
   taskgroup = task->taskgroup;
-  if (__builtin_expect (taskgroup == NULL, 0)
-      && thr->ts.level == 0)
+  if (__builtin_expect (taskgroup == NULL, 0) && thr->ts.level == 0)
     {
       /* This can happen if GOMP_taskgroup_start is called when
 	 thr->ts.team == NULL, but inside of the taskgroup there
@@ -1895,128 +1436,10 @@ GOMP_taskgroup_end (void)
   if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
     goto finish;
 
-  bool unused;
-  gomp_mutex_lock (&team->task_lock);
-  while (1)
-    {
-      bool cancelled = false;
-      if (priority_queue_empty_p (&taskgroup->taskgroup_queue,
-				  MEMMODEL_RELAXED))
-	{
-	  if (taskgroup->num_children)
-	    {
-	      if (priority_queue_empty_p (&task->children_queue,
-					  MEMMODEL_RELAXED))
-		goto do_wait;
-	      child_task
-		= priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
-					    PQ_TEAM, &team->task_queue,
-					    &unused);
-	    }
-	  else
-	    {
-	      gomp_mutex_unlock (&team->task_lock);
-	      if (to_free)
-		{
-		  gomp_finish_task (to_free);
-		  free (to_free);
-		}
-	      goto finish;
-	    }
-	}
-      else
-	child_task
-	  = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-				      PQ_TEAM, &team->task_queue, &unused);
-      if (child_task->kind == GOMP_TASK_WAITING)
-	{
-	  cancelled
-	    = gomp_task_run_pre (child_task, child_task->parent, team);
-	  if (__builtin_expect (cancelled, 0))
-	    {
-	      if (to_free)
-		{
-		  gomp_finish_task (to_free);
-		  free (to_free);
-		  to_free = NULL;
-		}
-	      goto finish_cancelled;
-	    }
-	}
-      else
-	{
-	  child_task = NULL;
-	 do_wait:
-	/* All tasks we are waiting for are either running in other
-	   threads, or they are tasks that have not had their
-	   dependencies met (so they're not even in the queue).  Wait
-	   for them.  */
-	  taskgroup->in_taskgroup_wait = true;
-	}
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
-	{
-	  gomp_team_barrier_wake (&team->barrier, do_wake);
-	  do_wake = 0;
-	}
-      if (to_free)
-	{
-	  gomp_finish_task (to_free);
-	  free (to_free);
-	  to_free = NULL;
-	}
-      if (child_task)
-	{
-	  thr->task = child_task;
-	  if (__builtin_expect (child_task->fn == NULL, 0))
-	    {
-	      if (gomp_target_task_fn (child_task->fn_data))
-		{
-		  thr->task = task;
-		  gomp_mutex_lock (&team->task_lock);
-		  child_task->kind = GOMP_TASK_ASYNC_RUNNING;
-		  struct gomp_target_task *ttask
-		    = (struct gomp_target_task *) child_task->fn_data;
-		  /* If GOMP_PLUGIN_target_task_completion has run already
-		     in between gomp_target_task_fn and the mutex lock,
-		     perform the requeuing here.  */
-		  if (ttask->state == GOMP_TARGET_TASK_FINISHED)
-		    gomp_target_task_completion (team, child_task);
-		  else
-		    ttask->state = GOMP_TARGET_TASK_RUNNING;
-		  child_task = NULL;
-		  continue;
-		}
-	    }
-	  else
-	    child_task->fn (child_task->fn_data);
-	  thr->task = task;
-	}
-      else
-	gomp_sem_wait (&taskgroup->taskgroup_sem);
-      gomp_mutex_lock (&team->task_lock);
-      if (child_task)
-	{
-	 finish_cancelled:;
-	  size_t new_tasks
-	    = gomp_task_run_post_handle_depend (child_task, team);
-	  gomp_task_run_post_remove_parent (child_task);
-	  gomp_clear_parent (&child_task->children_queue);
-	  gomp_task_run_post_remove_taskgroup (child_task);
-	  to_free = child_task;
-	  child_task = NULL;
-	  team->task_count--;
-	  if (new_tasks > 1)
-	    {
-	      do_wake = team->nthreads - team->task_running_count
-			- !task->in_tied_task;
-	      if (do_wake > new_tasks)
-		do_wake = new_tasks;
-	    }
-	}
-    }
+  while (__atomic_load_n (&taskgroup->num_children, MEMMODEL_RELAXED) != 0)
+    gomp_execute_task (team, thr, task);
 
- finish:
+ finish:;
   task->taskgroup = taskgroup->prev;
   gomp_sem_destroy (&taskgroup->taskgroup_sem);
   free (taskgroup);
@@ -2124,14 +1547,15 @@ gomp_create_artificial_team (void)
   thr->ts.single_count = 0;
 #endif
   thr->ts.static_trip = 0;
-  thr->task = &team->implicit_task[0];
+  struct gomp_task *implicit_task = gomp_team_implicit_task (team);
+  thr->task = &implicit_task[0];
   gomp_init_task (thr->task, NULL, icv);
   if (task)
     {
       thr->task = task;
       gomp_end_task ();
       free (task);
-      thr->task = &team->implicit_task[0];
+      thr->task = &implicit_task[0];
     }
 #ifdef LIBGOMP_USE_PTHREADS
   else
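
As an aside, the counter-based wait that replaces struct gomp_taskwait can
be modeled standalone.  The sketch below is an illustration, not code from
the patch: the parent spins on an atomic child counter with an acquire
load, and sched_yield stands in for the real gomp_execute_task help-first
step.

/* taskwait_sketch.c: cc -std=c11 -pthread taskwait_sketch.c  */
#include <stdatomic.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NCHILDREN 3

static atomic_int num_children = NCHILDREN;

static void *
child_fn (void *arg)
{
  (void) arg;
  /* The task body would run here.  The release store pairs with the
     parent's acquire load so the parent sees the child's side effects.  */
  atomic_fetch_sub_explicit (&num_children, 1, memory_order_release);
  return NULL;
}

int
main (void)
{
  pthread_t tid[NCHILDREN];
  for (int i = 0; i < NCHILDREN; i++)
    pthread_create (&tid[i], NULL, child_fn, NULL);

  /* Help-first wait: rather than sleeping on a taskwait semaphore, keep
     doing something useful (here merely yielding) until the counter hits
     zero.  */
  while (atomic_load_explicit (&num_children, memory_order_acquire) != 0)
    sched_yield ();

  for (int i = 0; i < NCHILDREN; i++)
    pthread_join (tid[i], NULL);
  puts ("all children done");
  return 0;
}
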
--- libgomp/team.c.jj	2019-03-28 14:54:51.425124254 +0100
+++ libgomp/team.c	2019-09-02 12:25:37.511318624 +0200
@@ -170,14 +170,15 @@ gomp_new_team (unsigned nthreads)
   if (team == NULL)
     {
       size_t extra = sizeof (team->ordered_release[0])
-		     + sizeof (team->implicit_task[0]);
+		     + sizeof (struct gomp_taskqueue)
+		     + sizeof (team->taskqueues_and_implicit_tasks[0]);
       team = gomp_malloc (sizeof (*team) + nthreads * extra);
 
 #ifndef HAVE_SYNC_BUILTINS
       gomp_mutex_init (&team->work_share_list_free_lock);
 #endif
       gomp_barrier_init (&team->barrier, nthreads);
-      gomp_mutex_init (&team->task_lock);
+      gomp_mutex_init (&team->barrier_lock);
 
       team->nthreads = nthreads;
     }
@@ -195,17 +196,24 @@ gomp_new_team (unsigned nthreads)
     team->work_shares[i].next_free = &team->work_shares[i + 1];
   team->work_shares[i].next_free = NULL;
 
-  gomp_sem_init (&team->master_release, 0);
-  team->ordered_release = (void *) &team->implicit_task[nthreads];
-  team->ordered_release[0] = &team->master_release;
-
-  priority_queue_init (&team->task_queue);
+  team->num_taskqueue = nthreads;
+  struct gomp_taskqueue *queues = gomp_team_taskqueue (team);
+  for (i = 0; i < team->num_taskqueue; i++)
+    {
+      priority_queue_init (&queues[i].priority_queue);
+      gomp_mutex_init (&queues[i].queue_lock);
+    }
   team->task_count = 0;
   team->task_queued_count = 0;
   team->task_running_count = 0;
   team->work_share_cancelled = 0;
   team->team_cancelled = 0;
 
+  struct gomp_task *implicit_task = gomp_team_implicit_task (team);
+  gomp_sem_init (&team->master_release, 0);
+  team->ordered_release = (void *) &implicit_task[nthreads];
+  team->ordered_release[0] = &team->master_release;
+
   return team;
 }
 
@@ -219,8 +227,15 @@ free_team (struct gomp_team *team)
   gomp_mutex_destroy (&team->work_share_list_free_lock);
 #endif
   gomp_barrier_destroy (&team->barrier);
-  gomp_mutex_destroy (&team->task_lock);
-  priority_queue_free (&team->task_queue);
+  gomp_mutex_destroy (&team->barrier_lock);
+
+  struct gomp_taskqueue *queues = gomp_team_taskqueue (team);
+  for (int i = 0; i < team->num_taskqueue; ++i)
+    {
+      priority_queue_free (&queues[i].priority_queue);
+      gomp_mutex_destroy (&queues[i].queue_lock);
+    }
+
   free (team);
 }
 
@@ -349,7 +364,7 @@ gomp_team_start (void (*fn) (void *), vo
   thr->ts.single_count = 0;
 #endif
   thr->ts.static_trip = 0;
-  thr->task = &team->implicit_task[0];
+  thr->task = &gomp_team_implicit_task (team)[0];
 #ifdef GOMP_NEEDS_THREAD_HANDLE
   thr->handle = pthread_self ();
 #endif
@@ -366,8 +381,10 @@ gomp_team_start (void (*fn) (void *), vo
     bind_var = gomp_bind_var_list[thr->ts.level];
   gomp_init_task (thr->task, task, icv);
   thr->task->taskgroup = taskgroup;
-  team->implicit_task[0].icv.nthreads_var = nthreads_var;
-  team->implicit_task[0].icv.bind_var = bind_var;
+
+  struct gomp_task *implicit_task = gomp_team_implicit_task (team);
+  implicit_task[0].icv.nthreads_var = nthreads_var;
+  implicit_task[0].icv.bind_var = bind_var;
 
   if (nthreads == 1)
     return;
@@ -637,11 +654,11 @@ gomp_team_start (void (*fn) (void *), vo
 	  nthr->ts.single_count = 0;
 #endif
 	  nthr->ts.static_trip = 0;
-	  nthr->task = &team->implicit_task[i];
+	  nthr->task = &implicit_task[i];
 	  nthr->place = place;
 	  gomp_init_task (nthr->task, task, icv);
-	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
-	  team->implicit_task[i].icv.bind_var = bind_var;
+	  implicit_task[i].icv.nthreads_var = nthreads_var;
+	  implicit_task[i].icv.bind_var = bind_var;
 	  nthr->task->taskgroup = taskgroup;
 	  nthr->fn = fn;
 	  nthr->data = data;
@@ -824,10 +841,10 @@ gomp_team_start (void (*fn) (void *), vo
       start_data->ts.single_count = 0;
 #endif
       start_data->ts.static_trip = 0;
-      start_data->task = &team->implicit_task[i];
+      start_data->task = &implicit_task[i];
       gomp_init_task (start_data->task, task, icv);
-      team->implicit_task[i].icv.nthreads_var = nthreads_var;
-      team->implicit_task[i].icv.bind_var = bind_var;
+      implicit_task[i].icv.nthreads_var = nthreads_var;
+      implicit_task[i].icv.bind_var = bind_var;
       start_data->task->taskgroup = taskgroup;
       start_data->thread_pool = pool;
       start_data->nested = nested;
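
gomp_new_team now sets up one queue plus lock per thread.  For reference,
one plausible shape of a per-queue-locked, work-stealing dequeue over such
queues is sketched below; this is illustrative only and the patch's actual
gomp_dequeue_task may differ.

/* steal_sketch.c: cc -pthread steal_sketch.c  */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct task { int id; };                /* stand-in for struct gomp_task */

struct taskqueue                        /* stand-in for struct gomp_taskqueue */
{
  struct task *head;                    /* real code holds a priority queue */
  pthread_mutex_t queue_lock;
};

static struct task *
dequeue_task (struct taskqueue *queues, unsigned nqueues, unsigned self)
{
  /* Start at our own queue so the common case involves no stealing.  */
  for (unsigned i = 0; i < nqueues; i++)
    {
      struct taskqueue *q = &queues[(self + i) % nqueues];
      pthread_mutex_lock (&q->queue_lock);
      struct task *t = q->head;
      q->head = NULL;                   /* real code unlinks a single node */
      pthread_mutex_unlock (&q->queue_lock);
      if (t)
	return t;                       /* found work, possibly stolen */
    }
  return NULL;                          /* every queue was empty */
}

int
main (void)
{
  struct task work = { 42 };
  struct taskqueue queues[2] = {
    { NULL, PTHREAD_MUTEX_INITIALIZER },
    { &work, PTHREAD_MUTEX_INITIALIZER },
  };
  /* Thread 0 finds its own queue empty and steals from queue 1.  */
  struct task *t = dequeue_task (queues, 2, 0);
  printf ("got task %d\n", t ? t->id : -1);
  return 0;
}
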
--- libgomp/libgomp.h.jj	2019-05-14 21:37:35.503357563 +0200
+++ libgomp/libgomp.h	2019-09-02 13:02:20.187427096 +0200
@@ -405,7 +405,23 @@ enum gomp_task_kind
      but not yet completed.  Once that completes, they will be readded
      into the queues as GOMP_TASK_WAITING in order to perform the var
      unmapping.  */
-  GOMP_TASK_ASYNC_RUNNING
+  GOMP_TASK_ASYNC_RUNNING,
+};
+
+/* States used to track by whom and when a task should be freed.  */
+enum gomp_task_state
+{
+  /* Some tasks are depending on this task.  This task thus shouldn't be
+     freed before the children mark it as GOMP_STATE_FREE.  */
+  GOMP_STATE_NORMAL,
+
+  /* The task is done executing.  It can be freed once no thread is waiting
+     on its semaphore and no children are left.  */
+  GOMP_STATE_DONE,
+
+  /* No thread is waiting on the task's semaphore and no children are left.
+     The task is free to go.  */
+  GOMP_STATE_CRITICAL,
 };
 
 struct gomp_task_depend_entry
@@ -429,31 +445,25 @@ struct gomp_dependers_vec
   struct gomp_task *elem[];
 };
 
-/* Used when in GOMP_taskwait or in gomp_task_maybe_wait_for_dependencies.  */
-
-struct gomp_taskwait
-{
-  bool in_taskwait;
-  bool in_depend_wait;
-  /* Number of tasks we are waiting for.  */
-  size_t n_depend;
-  gomp_sem_t taskwait_sem;
-};
-
 /* This structure describes a "task" to be run by a thread.  */
 
 struct gomp_task
 {
   /* Parent of this task.  */
   struct gomp_task *parent;
-  /* Children of this task.  */
-  struct priority_queue children_queue;
   /* Taskgroup this task belongs in.  */
   struct gomp_taskgroup *taskgroup;
   /* Tasks that depend on this task.  */
   struct gomp_dependers_vec *dependers;
   struct htab *depend_hash;
   struct gomp_taskwait *taskwait;
+
+  /* Mutex for protecting the dependency hash table and the lifecycle of the
+     task.  The lock is taken whenever dependencies are updated and the
+     a task lifecycle related critical section is entered (e.g. num_children
+     becomes 0).  */
+  gomp_mutex_t depend_lock;
+
   /* Number of items in DEPEND.  */
   size_t depend_count;
   /* Number of tasks this task depends on.  Once this counter reaches
@@ -461,18 +471,23 @@ struct gomp_task
      into the various queues to be scheduled.  */
   size_t num_dependees;
 
+  /* Number of unmet dependencies.  Only used in
+     gomp_task_maybe_wait_for_dependencies.  */
+  int num_awaited;
+
+  /* Number of children created and queued from this task.  */
+  size_t num_children;
+
   /* Priority of this task.  */
   int priority;
-  /* The priority node for this task in each of the different queues.
-     We put this here to avoid allocating space for each priority
-     node.  Then we play offsetof() games to convert between pnode[]
-     entries and the gomp_task in which they reside.  */
-  struct priority_node pnode[3];
+  /* The priority node for this task.  */
+  struct priority_node pnode;
 
   struct gomp_task_icv icv;
   void (*fn) (void *);
   void *fn_data;
   enum gomp_task_kind kind;
+  enum gomp_task_state state;
   bool in_tied_task;
   bool final_task;
   bool copy_ctors_done;
@@ -490,8 +505,6 @@ struct gomp_task
 struct gomp_taskgroup
 {
   struct gomp_taskgroup *prev;
-  /* Queue of tasks that belong in this taskgroup.  */
-  struct priority_queue taskgroup_queue;
   uintptr_t *reductions;
   bool in_taskgroup_wait;
   bool cancelled;
@@ -530,6 +543,17 @@ struct gomp_target_task
   void *hostaddrs[];
 };
 
+/* This structure describes the queues containing the tasks to be
+   executed.  */
+
+struct gomp_taskqueue
+{
+  struct priority_queue priority_queue;
+
+  /* Lock protecting this specific queue.  */
+  gomp_mutex_t queue_lock;
+};
+
 /* This structure describes a "team" of threads.  These are the threads
    that are spawned by a PARALLEL constructs, as well as the work sharing
    constructs that the team encounters.  */
@@ -587,13 +611,13 @@ struct gomp_team
   /* This barrier is used for most synchronization of the team.  */
   gomp_barrier_t barrier;
 
+  /* Lock for synchronizing barrier state.  */
+  gomp_mutex_t barrier_lock;
+
   /* Initial work shares, to avoid allocating any gomp_work_share
      structs in the common case.  */
   struct gomp_work_share work_shares[8];
 
-  gomp_mutex_t task_lock;
-  /* Scheduled tasks.  */
-  struct priority_queue task_queue;
   /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team.  */
   unsigned int task_count;
   /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled.  */
@@ -609,8 +633,13 @@ struct gomp_team
   int work_share_cancelled;
   int team_cancelled;
 
-  /* This array contains structures for implicit tasks.  */
-  struct gomp_task implicit_task[];
+  /* Number of taskqueues.  */
+  unsigned int num_taskqueue;
+
+  /* This array contains the taskqueues and the structures for implicit tasks.
+     The implicit tasks start at &taskqueues_and_implicit_tasks
+     + sizeof (struct gomp_taskqueue) * num_taskqueue.  */
+  struct gomp_task taskqueues_and_implicit_tasks[];
 };
 
 /* This structure contains all data that is private to libgomp and is
@@ -829,6 +858,10 @@ extern bool gomp_create_target_task (str
 				     enum gomp_target_task_state);
 extern struct gomp_taskgroup *gomp_parallel_reduction_register (uintptr_t *,
 								unsigned);
+extern void gomp_enqueue_task (struct gomp_task *task, struct gomp_team *team,
+			       struct gomp_thread *thread, int priority);
+extern struct gomp_task *gomp_dequeue_task (struct gomp_team *team,
+					    struct gomp_thread *thread);
 extern void gomp_workshare_taskgroup_start (void);
 extern void gomp_workshare_task_reduction_register (uintptr_t *, uintptr_t *);
 
@@ -837,6 +870,7 @@ gomp_finish_task (struct gomp_task *task
 {
   if (__builtin_expect (task->depend_hash != NULL, 0))
     free (task->depend_hash);
+  gomp_mutex_destroy (&task->depend_lock);
 }
 
 /* team.c */
@@ -1203,34 +1237,35 @@ extern int gomp_test_nest_lock_25 (omp_n
 #endif
 
 /* Helper function for priority_node_to_task() and
-   task_to_priority_node().
-
-   Return the offset from a task to its priority_node entry.  The
-   priority_node entry is has a type of TYPE.  */
+   task_to_priority_node().  */
 
-static inline size_t
-priority_queue_offset (enum priority_queue_type type)
+static inline struct gomp_task *
+priority_node_to_task (struct priority_node *node)
 {
-  return offsetof (struct gomp_task, pnode[(int) type]);
+  return ((struct gomp_task *)
+	  ((char *) node - offsetof (struct gomp_task, pnode)));
 }
 
-/* Return the task associated with a priority NODE of type TYPE.  */
-
-static inline struct gomp_task *
-priority_node_to_task (enum priority_queue_type type,
-		       struct priority_node *node)
+static inline struct priority_node *
+task_to_priority_node (struct gomp_task *task)
 {
-  return (struct gomp_task *) ((char *) node - priority_queue_offset (type));
+  return ((struct priority_node *)
+	  ((char *) task + offsetof (struct gomp_task, pnode)));
 }
 
-/* Return the priority node of type TYPE for a given TASK.  */
+static inline struct gomp_taskqueue *
+gomp_team_taskqueue (struct gomp_team *team)
+{
+  return (struct gomp_taskqueue *) &team->taskqueues_and_implicit_tasks;
+}
 
-static inline struct priority_node *
-task_to_priority_node (enum priority_queue_type type,
-		       struct gomp_task *task)
+static inline struct gomp_task *
+gomp_team_implicit_task (struct gomp_team *team)
 {
-  return (struct priority_node *) ((char *) task
-				   + priority_queue_offset (type));
+  return ((struct gomp_task *)
+	  &((struct gomp_taskqueue *)
+	    &team->taskqueues_and_implicit_tasks)[team->num_taskqueue]);
 }
 
 #ifdef LIBGOMP_USE_PTHREADS
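
The pointer games in gomp_team_taskqueue and gomp_team_implicit_task can be
modeled in isolation.  The following reduced, self-contained version (with
simplified stand-in types, same arithmetic) shows the shared trailing array
layout: nthreads queue structs first, nthreads implicit task structs after
them.

/* layout_sketch.c  */
#include <stdio.h>
#include <stdlib.h>

struct queue { int dummy; };            /* stand-in for gomp_taskqueue */
struct task  { int dummy; };            /* stand-in for gomp_task */

struct team
{
  unsigned num_taskqueue;
  struct task queues_and_tasks[];       /* taskqueues_and_implicit_tasks */
};

static struct queue *
team_taskqueue (struct team *t)
{
  return (struct queue *) &t->queues_and_tasks;
}

static struct task *
team_implicit_task (struct team *t)
{
  /* Skip over the num_taskqueue queue structs to reach the tasks.  */
  return (struct task *)
	 &((struct queue *) &t->queues_and_tasks)[t->num_taskqueue];
}

int
main (void)
{
  unsigned n = 4;
  size_t extra = sizeof (struct queue) + sizeof (struct task);
  struct team *t = malloc (sizeof *t + n * extra);
  t->num_taskqueue = n;
  printf ("queues at %p, tasks at %p\n",
	  (void *) team_taskqueue (t), (void *) team_implicit_task (t));
  free (t);
  return 0;
}
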
--- libgomp/taskloop.c.jj	2019-01-01 12:38:37.837651197 +0100
+++ libgomp/taskloop.c	2019-09-02 12:35:22.785583405 +0200
@@ -216,21 +216,13 @@ GOMP_taskloop (void (*fn) (void *), void
 		task_step -= step;
 	      fn (arg);
 	      arg += arg_size;
-	      if (!priority_queue_empty_p (&task[i].children_queue,
-					   MEMMODEL_RELAXED))
-		{
-		  gomp_mutex_lock (&team->task_lock);
-		  gomp_clear_parent (&task[i].children_queue);
-		  gomp_mutex_unlock (&team->task_lock);
-		}
-	      gomp_end_task ();
+	      gomp_maybe_end_task ();
 	    }
 	}
       else
 	for (i = 0; i < num_tasks; i++)
 	  {
 	    struct gomp_task task;
-
 	    gomp_init_task (&task, thr->task, gomp_icv (false));
 	    task.priority = priority;
 	    task.kind = GOMP_TASK_UNDEFERRED;
@@ -248,14 +240,7 @@ GOMP_taskloop (void (*fn) (void *), void
 	    if (i == nfirst)
 	      task_step -= step;
 	    fn (data);
-	    if (!priority_queue_empty_p (&task.children_queue,
-					 MEMMODEL_RELAXED))
-	      {
-		gomp_mutex_lock (&team->task_lock);
-		gomp_clear_parent (&task.children_queue);
-		gomp_mutex_unlock (&team->task_lock);
-	      }
-	    gomp_end_task ();
+	    gomp_maybe_end_task ();
 	  }
     }
   else
@@ -264,7 +249,6 @@ GOMP_taskloop (void (*fn) (void *), void
       struct gomp_task *parent = thr->task;
       struct gomp_taskgroup *taskgroup = parent->taskgroup;
       char *arg;
-      int do_wake;
       unsigned long i;
 
       for (i = 0; i < num_tasks; i++)
@@ -298,7 +282,6 @@ GOMP_taskloop (void (*fn) (void *), void
 	  task->fn_data = arg;
 	  task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
 	}
-      gomp_mutex_lock (&team->task_lock);
       /* If parallel or taskgroup has been cancelled, don't start new
 	 tasks.  */
       if (__builtin_expect (gomp_cancel_var, 0)
@@ -307,7 +290,6 @@ GOMP_taskloop (void (*fn) (void *), void
 	  if (gomp_team_barrier_cancelled (&team->barrier))
 	    {
 	    do_cancel:
-	      gomp_mutex_unlock (&team->task_lock);
 	      for (i = 0; i < num_tasks; i++)
 		{
 		  gomp_finish_task (tasks[i]);
@@ -319,50 +301,46 @@ GOMP_taskloop (void (*fn) (void *), void
 	    }
 	  if (taskgroup)
 	    {
-	      if (taskgroup->cancelled)
+	      if (__atomic_load_n (&taskgroup->cancelled, MEMMODEL_ACQUIRE))
 		goto do_cancel;
 	      if (taskgroup->workshare
 		  && taskgroup->prev
-		  && taskgroup->prev->cancelled)
+		  && __atomic_load_n (&taskgroup->prev->cancelled,
+				      MEMMODEL_ACQUIRE))
 		goto do_cancel;
 	    }
 	}
+
       if (taskgroup)
-	taskgroup->num_children += num_tasks;
+	__atomic_add_fetch (&taskgroup->num_children, num_tasks,
+			    MEMMODEL_ACQ_REL);
+      __atomic_add_fetch (&parent->num_children, num_tasks, MEMMODEL_ACQ_REL);
+      __atomic_add_fetch (&team->task_count, num_tasks, MEMMODEL_ACQ_REL);
+
       for (i = 0; i < num_tasks; i++)
 	{
 	  struct gomp_task *task = tasks[i];
-	  priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
-				 task, priority,
-				 PRIORITY_INSERT_BEGIN,
-				 /*last_parent_depends_on=*/false,
-				 task->parent_depends_on);
-	  if (taskgroup)
-	    priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
-				   task, priority, PRIORITY_INSERT_BEGIN,
-				   /*last_parent_depends_on=*/false,
-				   task->parent_depends_on);
-	  priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority,
-				 PRIORITY_INSERT_END,
-				 /*last_parent_depends_on=*/false,
-				 task->parent_depends_on);
-	  ++team->task_count;
-	  ++team->task_queued_count;
+	  gomp_enqueue_task (task, team, thr, task->priority);
 	}
+
+      gomp_mutex_lock (&team->barrier_lock);
       gomp_team_barrier_set_task_pending (&team->barrier);
-      if (team->task_running_count + !parent->in_tied_task
+      gomp_mutex_unlock (&team->barrier_lock);
+
+      if (__atomic_load_n (&team->task_running_count, MEMMODEL_ACQUIRE)
+	  + !__atomic_load_n (&parent->in_tied_task, MEMMODEL_ACQUIRE)
 	  < team->nthreads)
 	{
-	  do_wake = team->nthreads - team->task_running_count
-		    - !parent->in_tied_task;
+	  int do_wake
+	    = team->nthreads
+	      - __atomic_load_n (&team->task_running_count, MEMMODEL_ACQUIRE)
+	      - !__atomic_load_n (&parent->in_tied_task, MEMMODEL_ACQUIRE);
 	  if ((unsigned long) do_wake > num_tasks)
 	    do_wake = num_tasks;
+
+	  if (do_wake)
+	    gomp_team_barrier_wake (&team->barrier, do_wake);
 	}
-      else
-	do_wake = 0;
-      gomp_mutex_unlock (&team->task_lock);
-      if (do_wake)
-	gomp_team_barrier_wake (&team->barrier, do_wake);
     }
   if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
     ialias_call (GOMP_taskgroup_end) ();
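
Restated as a standalone helper for clarity (not part of the patch), the
do_wake clamp above wakes no more threads than are currently idle and no
more than the number of newly queued tasks.

/* wake_sketch.c  */
#include <stdio.h>

static int
compute_do_wake (int nthreads, int task_running_count, int in_tied_task,
		 unsigned long num_tasks)
{
  int do_wake = nthreads - task_running_count - !in_tied_task;
  if (do_wake < 0)
    do_wake = 0;
  if ((unsigned long) do_wake > num_tasks)
    do_wake = num_tasks;
  return do_wake;
}

int
main (void)
{
  /* 8 threads, 5 running, caller tied, 2 new tasks: wake 2 of 3 idle.  */
  printf ("%d\n", compute_do_wake (8, 5, 1, 2));
  return 0;
}
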
--- libgomp/parallel.c.jj	2019-01-01 12:38:37.635654512 +0100
+++ libgomp/parallel.c	2019-09-02 12:25:37.472319206 +0200
@@ -252,9 +252,8 @@ GOMP_cancel (int which, bool do_cancel)
 	    taskgroup = taskgroup->prev;
 	  if (!taskgroup->cancelled)
 	    {
-	      gomp_mutex_lock (&team->task_lock);
-	      taskgroup->cancelled = true;
-	      gomp_mutex_unlock (&team->task_lock);
+	      __atomic_store_n (&taskgroup->cancelled, true,
+				MEMMODEL_RELEASE);
 	    }
 	}
       return true;
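
With the task_lock gone, the cancelled flag relies on release/acquire
pairing: the canceller's release store synchronizes with the acquire loads
added in GOMP_taskwait_depend and GOMP_taskloop.  In isolation, using C11
atomics rather than libgomp's wrappers, the pairing looks like this.

/* cancel_sketch.c  */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool cancelled;

static void
do_cancel (void)
{
  /* Everything the cancelling thread wrote before this store is visible
     to any thread whose acquire load below reads true.  */
  atomic_store_explicit (&cancelled, true, memory_order_release);
}

static bool
skip_new_tasks (void)
{
  return atomic_load_explicit (&cancelled, memory_order_acquire);
}

int
main (void)
{
  do_cancel ();
  printf ("skip new tasks: %d\n", skip_new_tasks ());
  return 0;
}
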
--- libgomp/target.c.jj	2019-08-08 08:37:11.707824235 +0200
+++ libgomp/target.c	2019-09-02 12:25:37.492318907 +0200
@@ -1757,14 +1757,16 @@ GOMP_target_ext (int device, void (*fn)
 	  thr->ts.single_count = 0;
 #endif
 	  thr->ts.static_trip = 0;
-	  thr->task = &team->implicit_task[0];
+
+	  struct gomp_task *implicit_task = gomp_team_implicit_task (team);
+	  thr->task = &implicit_task[0];
 	  gomp_init_task (thr->task, NULL, icv);
 	  if (task)
 	    {
 	      thr->task = task;
 	      gomp_end_task ();
 	      free (task);
-	      thr->task = &team->implicit_task[0];
+	      thr->task = &implicit_task[0];
 	    }
 	  else
 	    pthread_setspecific (gomp_thread_destructor, thr);
--- libgomp/priority_queue.c.jj	2019-01-01 12:38:37.739652806 +0100
+++ libgomp/priority_queue.c	2019-09-02 12:25:37.473319191 +0200
@@ -36,14 +36,13 @@
    TYPE is the type of priority queue this task resides in.  */
 
 static inline bool
-priority_queue_task_in_list_p (enum priority_queue_type type,
-			       struct priority_list *list,
+priority_queue_task_in_list_p (struct priority_list *list,
 			       struct gomp_task *task)
 {
   struct priority_node *p = list->tasks;
   do
     {
-      if (priority_node_to_task (type, p) == task)
+      if (priority_node_to_task (p) == task)
 	return true;
       p = p->next;
     }
@@ -54,31 +53,29 @@ priority_queue_task_in_list_p (enum prio
 /* Tree version of priority_queue_task_in_list_p.  */
 
 static inline bool
-priority_queue_task_in_tree_p (enum priority_queue_type type,
-			       struct priority_queue *head,
+priority_queue_task_in_tree_p (struct priority_queue *head,
 			       struct gomp_task *task)
 {
   struct priority_list *list
     = priority_queue_lookup_priority (head, task->priority);
   if (!list)
     return false;
-  return priority_queue_task_in_list_p (type, list, task);
+  return priority_queue_task_in_list_p (list, task);
 }
 
 /* Generic version of priority_queue_task_in_list_p that works for
    trees or lists.  */
 
 bool
-priority_queue_task_in_queue_p (enum priority_queue_type type,
-				struct priority_queue *head,
+priority_queue_task_in_queue_p (struct priority_queue *head,
 				struct gomp_task *task)
 {
   if (priority_queue_empty_p (head, MEMMODEL_RELAXED))
     return false;
   if (priority_queue_multi_p (head))
-    return priority_queue_task_in_tree_p (type, head, task);
+    return priority_queue_task_in_tree_p (head, task);
   else
-    return priority_queue_task_in_list_p (type, &head->l, task);
+    return priority_queue_task_in_list_p (&head->l, task);
 }
 
 /* Sanity check LIST to make sure the tasks therein are in the right
@@ -93,15 +90,14 @@ priority_queue_task_in_queue_p (enum pri
    ensure that we are verifying the children queue.  */
 
 static void
-priority_list_verify (enum priority_queue_type type,
-		      struct priority_list *list, bool check_deps)
+priority_list_verify (struct priority_list *list, bool check_deps)
 {
   bool seen_tied = false;
   bool seen_plain_waiting = false;
   struct priority_node *p = list->tasks;
   while (1)
     {
-      struct gomp_task *t = priority_node_to_task (type, p);
+      struct gomp_task *t = priority_node_to_task (p);
       if (seen_tied && t->kind == GOMP_TASK_WAITING)
 	gomp_fatal ("priority_queue_verify: WAITING task after TIED");
       if (t->kind >= GOMP_TASK_TIED)
@@ -123,13 +119,6 @@ priority_list_verify (enum priority_queu
     }
 }
 
-/* Callback type for priority_tree_verify_callback.  */
-struct cbtype
-{
-  enum priority_queue_type type;
-  bool check_deps;
-};
-
 /* Verify every task in NODE.
 
    Callback for splay_tree_foreach.  */
@@ -137,8 +126,7 @@ struct cbtype
 static void
 priority_tree_verify_callback (prio_splay_tree_key key, void *data)
 {
-  struct cbtype *cb = (struct cbtype *) data;
-  priority_list_verify (cb->type, &key->l, cb->check_deps);
+  priority_list_verify (&key->l, *(bool *)data);
 }
 
 /* Generic version of priority_list_verify.
@@ -152,19 +140,17 @@ priority_tree_verify_callback (prio_spla
    ensure that we are verifying the children queue.  */
 
 void
-priority_queue_verify (enum priority_queue_type type,
-		       struct priority_queue *head, bool check_deps)
+priority_queue_verify (struct priority_queue *head, bool check_deps)
 {
   if (priority_queue_empty_p (head, MEMMODEL_RELAXED))
     return;
   if (priority_queue_multi_p (head))
     {
-      struct cbtype cb = { type, check_deps };
-      prio_splay_tree_foreach (&head->t,
-			       priority_tree_verify_callback, &cb);
+      prio_splay_tree_foreach (&head->t, priority_tree_verify_callback,
+			       &check_deps);
     }
   else
-    priority_list_verify (type, &head->l, check_deps);
+    priority_list_verify (&head->l, check_deps);
 }
 #endif /* _LIBGOMP_CHECKING_ */
 
@@ -172,8 +158,7 @@ priority_queue_verify (enum priority_que
    tree.  HEAD contains tasks of type TYPE.  */
 
 void
-priority_tree_remove (enum priority_queue_type type,
-		      struct priority_queue *head,
+priority_tree_remove (struct priority_queue *head,
 		      struct priority_node *node)
 {
   /* ?? The only reason this function is not inlined is because we
@@ -181,7 +166,7 @@ priority_tree_remove (enum priority_queu
      completely defined in the header file).  If the lack of inlining
      is a concern, we could pass the priority number as a
      parameter, or we could move this to libgomp.h.  */
-  int priority = priority_node_to_task (type, node)->priority;
+  int priority = priority_node_to_task (node)->priority;
 
   /* ?? We could avoid this lookup by keeping a pointer to the key in
      the priority_node.  */
@@ -214,16 +199,15 @@ priority_tree_remove (enum priority_queu
    in-order.  */
 
 static struct gomp_task *
-priority_tree_next_task_1 (enum priority_queue_type type,
-			   prio_splay_tree_node node)
+priority_tree_next_task_1 (prio_splay_tree_node node)
 {
  again:
   if (!node)
     return NULL;
-  struct gomp_task *ret = priority_tree_next_task_1 (type, node->right);
+  struct gomp_task *ret = priority_tree_next_task_1 (node->right);
   if (ret)
     return ret;
-  ret = priority_node_to_task (type, node->key.l.tasks);
+  ret = priority_node_to_task (node->key.l.tasks);
   if (ret->kind == GOMP_TASK_WAITING)
     return ret;
   node = node->left;
@@ -245,44 +229,9 @@ priority_tree_next_task_1 (enum priority
    TRUE, otherwise it is set to FALSE.  */
 
 struct gomp_task *
-priority_tree_next_task (enum priority_queue_type type1,
-			 struct priority_queue *q1,
-			 enum priority_queue_type type2,
-			 struct priority_queue *q2,
-			 bool *q1_chosen_p)
-{
-  struct gomp_task *t1 = priority_tree_next_task_1 (type1, q1->t.root);
-  if (!t1
-      /* Special optimization when only searching through one queue.  */
-      || !q2)
-    {
-      *q1_chosen_p = true;
-      return t1;
-    }
-  struct gomp_task *t2 = priority_tree_next_task_1 (type2, q2->t.root);
-  if (!t2 || t1->priority > t2->priority)
-    {
-      *q1_chosen_p = true;
-      return t1;
-    }
-  if (t2->priority > t1->priority)
-    {
-      *q1_chosen_p = false;
-      return t2;
-    }
-  /* If we get here, the priorities are the same, so we must look at
-     parent_depends_on to make our decision.  */
-#if _LIBGOMP_CHECKING_
-  if (t1 != t2)
-    gomp_fatal ("priority_tree_next_task: t1 != t2");
-#endif
-  if (t2->parent_depends_on && !t1->parent_depends_on)
-    {
-      *q1_chosen_p = false;
-      return t2;
-    }
-  *q1_chosen_p = true;
-  return t1;
+priority_tree_next_task (struct priority_queue *q)
+{
+  return priority_tree_next_task_1 (q->t.root);
 }
 
 /* Priority splay trees comparison function.  */
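
With pnode[3] collapsed to a single pnode, the node/task conversion used
throughout priority_queue.c becomes a plain container_of in both
directions.  Modeled standalone with illustrative stand-in types:

/* pnode_sketch.c  */
#include <stddef.h>
#include <assert.h>

struct node { struct node *next, *prev; };   /* stand-in for priority_node */

struct task                                  /* stand-in for gomp_task */
{
  int priority;
  struct node pnode;                         /* exactly one node per task */
};

static struct task *
node_to_task (struct node *n)
{
  return (struct task *) ((char *) n - offsetof (struct task, pnode));
}

static struct node *
task_to_node (struct task *t)
{
  return &t->pnode;
}

int
main (void)
{
  struct task t;
  assert (node_to_task (task_to_node (&t)) == &t);
  return 0;
}
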
--- libgomp/priority_queue.h.jj	2019-01-01 12:38:37.860650820 +0100
+++ libgomp/priority_queue.h	2019-09-02 12:53:23.682442503 +0200
@@ -102,34 +102,14 @@ enum priority_insert_type {
   PRIORITY_INSERT_END
 };
 
-/* Used to determine in which queue a given priority node belongs in.
-   See pnode field of gomp_task.  */
-
-enum priority_queue_type
-{
-  PQ_TEAM,	    /* Node belongs in gomp_team's task_queue.  */
-  PQ_CHILDREN,	    /* Node belongs in parent's children_queue.  */
-  PQ_TASKGROUP,	    /* Node belongs in taskgroup->taskgroup_queue.  */
-  PQ_IGNORED = 999
-};
-
 /* Priority queue implementation prototypes.  */
-
-extern bool priority_queue_task_in_queue_p (enum priority_queue_type,
-					    struct priority_queue *,
+extern bool priority_queue_task_in_queue_p (struct priority_queue *,
 					    struct gomp_task *);
-extern void priority_queue_dump (enum priority_queue_type,
-				 struct priority_queue *);
-extern void priority_queue_verify (enum priority_queue_type,
-				   struct priority_queue *, bool);
-extern void priority_tree_remove (enum priority_queue_type,
-				  struct priority_queue *,
+extern void priority_queue_dump (struct priority_queue *);
+extern void priority_queue_verify (struct priority_queue *, bool);
+extern void priority_tree_remove (struct priority_queue *,
 				  struct priority_node *);
-extern struct gomp_task *priority_tree_next_task (enum priority_queue_type,
-						  struct priority_queue *,
-						  enum priority_queue_type,
-						  struct priority_queue *,
-						  bool *);
+extern struct gomp_task *priority_tree_next_task (struct priority_queue *);
 
 /* Return TRUE if there is more than one priority in HEAD.  This is
    used throughout to to choose between the fast path (priority 0 only
@@ -165,13 +145,10 @@ priority_queue_free (struct priority_que
 }
 
 /* Forward declarations.  */
-static inline size_t priority_queue_offset (enum priority_queue_type);
 static inline struct gomp_task *priority_node_to_task
-				(enum priority_queue_type,
-				 struct priority_node *);
+				(struct priority_node *);
 static inline struct priority_node *task_to_priority_node
-				    (enum priority_queue_type,
-				     struct gomp_task *);
+				    (struct gomp_task *);
 
 /* Return TRUE if priority queue HEAD is empty.
 
@@ -234,41 +211,19 @@ priority_queue_lookup_priority (struct p
    Return the new priority_node.  */
 
 static inline void
-priority_list_insert (enum priority_queue_type type,
-		      struct priority_list *list,
+priority_list_insert (struct priority_list *list,
 		      struct gomp_task *task,
 		      int priority,
 		      enum priority_insert_type pos,
-		      bool adjust_parent_depends_on,
 		      bool task_is_parent_depends_on)
 {
-  struct priority_node *node = task_to_priority_node (type, task);
+  struct priority_node *node = task_to_priority_node (task);
   if (list->tasks)
     {
-      /* If we are keeping track of higher/lower priority items,
-	 but this is a lower priority WAITING task
-	 (parent_depends_on != NULL), put it after all ready to
-	 run tasks.  See the comment in
-	 priority_queue_upgrade_task for a visual on how tasks
-	 should be organized.  */
-      if (adjust_parent_depends_on
-	  && pos == PRIORITY_INSERT_BEGIN
-	  && list->last_parent_depends_on
-	  && !task_is_parent_depends_on)
-	{
-	  struct priority_node *last_parent_depends_on
-	    = list->last_parent_depends_on;
-	  node->next = last_parent_depends_on->next;
-	  node->prev = last_parent_depends_on;
-	}
-      /* Otherwise, put it at the top/bottom of the queue.  */
-      else
-	{
-	  node->next = list->tasks;
-	  node->prev = list->tasks->prev;
-	  if (pos == PRIORITY_INSERT_BEGIN)
-	    list->tasks = node;
-	}
+      node->next = list->tasks;
+      node->prev = list->tasks->prev;
+      if (pos == PRIORITY_INSERT_BEGIN)
+	list->tasks = node;
       node->next->prev = node;
       node->prev->next = node;
     }
@@ -278,21 +233,15 @@ priority_list_insert (enum priority_queu
       node->prev = node;
       list->tasks = node;
     }
-  if (adjust_parent_depends_on
-      && list->last_parent_depends_on == NULL
-      && task_is_parent_depends_on)
-    list->last_parent_depends_on = node;
 }
 
 /* Tree version of priority_list_insert.  */
 
 static inline void
-priority_tree_insert (enum priority_queue_type type,
-		      struct priority_queue *head,
+priority_tree_insert (struct priority_queue *head,
 		      struct gomp_task *task,
 		      int priority,
 		      enum priority_insert_type pos,
-		      bool adjust_parent_depends_on,
 		      bool task_is_parent_depends_on)
 {
   if (__builtin_expect (head->t.root == NULL, 0))
@@ -324,84 +273,53 @@ priority_tree_insert (enum priority_queu
       prio_splay_tree_insert (&head->t, k);
       list = &k->key.l;
     }
-  priority_list_insert (type, list, task, priority, pos,
-			adjust_parent_depends_on,
+  priority_list_insert (list, task, priority, pos,
 			task_is_parent_depends_on);
 }
 
 /* Generic version of priority_*_insert.  */
 
 static inline void
-priority_queue_insert (enum priority_queue_type type,
-		       struct priority_queue *head,
+priority_queue_insert (struct priority_queue *head,
 		       struct gomp_task *task,
 		       int priority,
 		       enum priority_insert_type pos,
-		       bool adjust_parent_depends_on,
 		       bool task_is_parent_depends_on)
 {
 #if _LIBGOMP_CHECKING_
-  if (priority_queue_task_in_queue_p (type, head, task))
+  if (priority_queue_task_in_queue_p (head, task))
     gomp_fatal ("Attempt to insert existing task %p", task);
 #endif
   if (priority_queue_multi_p (head) || __builtin_expect (priority > 0, 0))
-    priority_tree_insert (type, head, task, priority, pos,
-			  adjust_parent_depends_on,
+    priority_tree_insert (head, task, priority, pos,
 			  task_is_parent_depends_on);
   else
-    priority_list_insert (type, &head->l, task, priority, pos,
-			  adjust_parent_depends_on,
+    priority_list_insert (&head->l, task, priority, pos,
 			  task_is_parent_depends_on);
 }
 
-/* If multiple priorities are in play, return the highest priority
-   task from within Q1 and Q2, while giving preference to tasks from
-   Q1.  If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to
-   TRUE, otherwise it is set to FALSE.
-
-   If multiple priorities are not in play (only 0 priorities are
-   available), the next task is chosen exclusively from Q1.
-
-   As a special case, Q2 can be NULL, in which case, we just choose
-   the highest priority WAITING task in Q1.  This is an optimization
-   to speed up looking through only one queue.
-
-   We assume Q1 has at least one item.  */
+/* Choose the highest priority item in Q, which is assumed to be nonempty.  */
 
 static inline struct gomp_task *
-priority_queue_next_task (enum priority_queue_type t1,
-			  struct priority_queue *q1,
-			  enum priority_queue_type t2,
-			  struct priority_queue *q2,
-			  bool *q1_chosen_p)
+priority_queue_next_task (struct priority_queue *q)
 {
 #if _LIBGOMP_CHECKING_
-  if (priority_queue_empty_p (q1, MEMMODEL_RELAXED))
-    gomp_fatal ("priority_queue_next_task: Q1 is empty");
+  if (priority_queue_empty_p (q, MEMMODEL_RELAXED))
+    gomp_fatal ("priority_queue_next_task: queue is empty");
 #endif
-  if (priority_queue_multi_p (q1))
+  if (priority_queue_multi_p (q))
     {
-      struct gomp_task *t
-	= priority_tree_next_task (t1, q1, t2, q2, q1_chosen_p);
+      struct gomp_task *t = priority_tree_next_task (q);
       /* If T is NULL, there are no WAITING tasks in Q1.  In which
 	 case, return any old (non-waiting) task which will cause the
 	 caller to do the right thing when checking T->KIND ==
 	 GOMP_TASK_WAITING.  */
       if (!t)
-	{
-#if _LIBGOMP_CHECKING_
-	  if (*q1_chosen_p == false)
-	    gomp_fatal ("priority_queue_next_task inconsistency");
-#endif
-	  return priority_node_to_task (t1, q1->t.root->key.l.tasks);
-	}
+	return priority_node_to_task (q->t.root->key.l.tasks);
       return t;
     }
   else
-    {
-      *q1_chosen_p = true;
-      return priority_node_to_task (t1, q1->l.tasks);
-    }
+    return priority_node_to_task (q->l.tasks);
 }
 
 /* Remove NODE from LIST.
@@ -454,18 +372,17 @@ remove_out:
    If the queue becomes empty after the remove, return TRUE.  */
 
 static inline bool
-priority_queue_remove (enum priority_queue_type type,
-		       struct priority_queue *head,
+priority_queue_remove (struct priority_queue *head,
 		       struct gomp_task *task,
 		       enum memmodel model)
 {
 #if _LIBGOMP_CHECKING_
-  if (!priority_queue_task_in_queue_p (type, head, task))
+  if (!priority_queue_task_in_queue_p (head, task))
     gomp_fatal ("Attempt to remove missing task %p", task);
 #endif
   if (priority_queue_multi_p (head))
     {
-      priority_tree_remove (type, head, task_to_priority_node (type, task));
+      priority_tree_remove (head, task_to_priority_node (task));
       if (head->t.root == NULL)
 	{
 	  if (model == MEMMODEL_RELEASE)
@@ -479,7 +396,7 @@ priority_queue_remove (enum priority_que
     }
   else
     return priority_list_remove (&head->l,
-				 task_to_priority_node (type, task), model);
+				 task_to_priority_node (task), model);
 }
 
 #endif /* _PRIORITY_QUEUE_H_ */


	Jakub

Attachment: PQ8
Description: Text document

