[PATCH 2/6, OpenACC, libgomp] Async re-work, oacc-* parts (revised, v3)

Thomas Schwinge thomas@codesourcery.com
Fri Dec 28 14:52:00 GMT 2018


Hi Chung-Lin!

On Sat, 22 Dec 2018 00:04:56 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> On 2018/12/19 5:03 AM, Thomas Schwinge wrote:
> > On Tue, 18 Dec 2018 23:06:38 +0800, Chung-Lin Tang <chunglin_tang@mentor.com> wrote:
> >> this part includes some of the lookup_goacc_asyncqueue fixes we talked about.
> >> I am still thinking about how the queue lock problem should really be solved, so regard
> >> this patch as just fixing some of the problems.
> 
> This is my solution to the queue lock stuff we talked about. I've only attached a diff to
> be applied atop of the existing changes, though that may actually be easier to review.
> 
> Note that this is still in testing, which means it hasn't been tested :P, but I'm
> posting to see if you have time to give it a look before the holidays.
> 
> Having the need for a lock on the async queues is quite irritating, especially
> when the structure needed for managing them is quite simple.
>
> Therefore, lets do away the need for locks entirely.

OK, if that's easily possible, fine.  Though, I won't object to the
current "lock" version, if done properly in all the relevant places, as
pointed out.

(And, as that was an open issue, as far as I remember: I think its fine
to have "aq"s not protected by the async lock, because they will only be
destroyed by device shutdown, and at that point, all async operations
must be synchronized with the local (host) thread, so the are then "aq"s
idle.)

> This patch makes the asyncqueue part of the device->openacc.async managed by lock-free
> atomic operations; almost all of the complexity is contained in lookup_goacc_asyncqueue(),
> so it should be not too complex. A descriptor and the queue array is allocated/exchanged
> atomically when more storage is required, while in the common case a simple lookup is enough.
> The fact that we manage asyncqueues by only expanding and never destroying asyncqueues
> during the device lifetime also simplifies many things.
> 
> The current implementation may be not that optimized and clunky in some cases, but I think
> this should be the way to implement what should be a fairly simple asyncqueue array and active
> list.

OK conceptually, but from the (very quick only!) look that I had, I did
not completely understand the data structure you're adding there, and
don't you also have to use atomic instructions for reading the lock-free
data structure ("aqdesc" etc.)?

> I'll update more on the status as testing proceeds.
> 
> (and about the other corners you noticed in the last mail, I'll get to that later...)

OK, thanks.


Grüße
 Thomas


> >> --- libgomp/oacc-async.c	(revision 267226)
> >> +++ libgomp/oacc-async.c	(working copy)
> > 
> >> +attribute_hidden struct goacc_asyncqueue *
> >> +lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
> >> +{
> >> +[...]
> >> +      /* Link new async queue into active list.  */
> >> +      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
> >> +      n->aq = dev->openacc.async.asyncqueue[async];
> >> +      n->next = dev->openacc.async.active;
> >> +      dev->openacc.async.active = n;
> >> +    }
> >> +  gomp_mutex_unlock (&dev->openacc.async.lock);
> > 
> > You still need to keep "async" locked during...
> > 
> >> +  return dev->openacc.async.asyncqueue[async];
> > 
> > ... this dereference.
> > 
> >> +}

> --- trunk-orig/libgomp/libgomp.h	2018-12-20 23:24:20.040375724 +0800
> +++ trunk-work/libgomp/libgomp.h	2018-12-21 23:32:47.938628954 +0800
> @@ -937,6 +937,13 @@
>  
>  #include "splay-tree.h"
>  
> +struct goacc_asyncqueue_desc
> +{
> +  int nasyncqueue;
> +  struct goacc_asyncqueue **asyncqueue;
> +  struct goacc_asyncqueue_list *active;
> +};
> +
>  typedef struct acc_dispatch_t
>  {
>    /* This is a linked list of data mapped using the
> @@ -955,10 +962,7 @@
>      *destroy_thread_data_func;
>    
>    struct {
> -    gomp_mutex_t lock;
> -    int nasyncqueue;
> -    struct goacc_asyncqueue **asyncqueue;
> -    struct goacc_asyncqueue_list *active;
> +    struct goacc_asyncqueue_desc *aqdesc;
>  
>      __typeof (GOMP_OFFLOAD_openacc_async_construct) *construct_func;
>      __typeof (GOMP_OFFLOAD_openacc_async_destruct) *destruct_func;
> diff -ru trunk-orig/libgomp/oacc-async.c trunk-work/libgomp/oacc-async.c
> --- trunk-orig/libgomp/oacc-async.c	2018-12-18 22:19:51.923102938 +0800
> +++ trunk-work/libgomp/oacc-async.c	2018-12-21 23:40:36.088528890 +0800
> @@ -70,45 +70,84 @@
>  
>    struct gomp_device_descr *dev = thr->dev;
>  
> -  gomp_mutex_lock (&dev->openacc.async.lock);
> +  struct goacc_asyncqueue_desc *aqdesc = dev->openacc.async.aqdesc;
>  
> -  if (!create
> -      && (async >= dev->openacc.async.nasyncqueue
> -	  || !dev->openacc.async.asyncqueue[async]))
> -    {
> -      gomp_mutex_unlock (&dev->openacc.async.lock);
> -      return NULL;
> -    }
> +  if (!create)
> +    return async < aqdesc->nasyncqueue ? aqdesc->asyncqueue[async] : NULL;
>  
> -  if (async >= dev->openacc.async.nasyncqueue)
> -    {
> -      int diff = async + 1 - dev->openacc.async.nasyncqueue;
> -      dev->openacc.async.asyncqueue
> -	= gomp_realloc (dev->openacc.async.asyncqueue,
> -			sizeof (goacc_aq) * (async + 1));
> -      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
> -	      0, sizeof (goacc_aq) * diff);
> -      dev->openacc.async.nasyncqueue = async + 1;
> -    }
> +  if (async < aqdesc->nasyncqueue && aqdesc->asyncqueue[async])
> +    return aqdesc->asyncqueue[async];
>  
> -  if (!dev->openacc.async.asyncqueue[async])
> -    {
> -      dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
> +  struct goacc_asyncqueue *new_aq = dev->openacc.async.construct_func ();
> +  const int inc_size = 8;
> +  int new_sz = async + 1 + inc_size;
> +  new_sz = (new_sz + inc_size - 1) & ~(inc_size - 1);
> +
> +  struct goacc_asyncqueue **new_aqvec =
> +    gomp_malloc_cleared (sizeof (goacc_aq) * new_sz);
> +  new_aqvec[async] = new_aq;
> +
> +  struct goacc_asyncqueue_desc *ndesc =
> +    gomp_malloc (sizeof (struct goacc_asyncqueue_desc));
> +  ndesc->asyncqueue = new_aqvec;
> +  ndesc->nasyncqueue = new_sz;
> +
> +  goacc_aq_list nl = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
> +  nl->aq = new_aq;
> +  ndesc->active = nl;
> +
> +  /* Loop to atomically create and add a new async queue, without using a lock,
> +     though may possibly requiring creating and replacing with a new descriptor.
> +     This may likely not be as optimized as it can be, although simplifies
> +     other handling of async queues.  */
> +  do {
> +    int curr_num_aqs = aqdesc->nasyncqueue;
> +    
> +    if (async < curr_num_aqs)
> +      {
> +	/* Someone else expanded the asyncqeue array to enough size, free the
> +	   allocated resources.  */
> +	free (ndesc->asyncqueue);
> +	free (ndesc);
> +
> +	/* Try to set the new aq to the right place again.  */
> +	struct goacc_asyncqueue_desc *nullq = NULL;
> +	if (__atomic_compare_exchange (&aqdesc->asyncqueue[async],
> +				       &nullq, &new_aq, false,
> +				       __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE))
> +	  {
> +	    /* Push new asyncqueue to front of active list.  */
> +	    nl->next = aqdesc->active;
> +	    while (!__atomic_compare_exchange (&aqdesc->active, &nl->next, &nl,
> +					       true, __ATOMIC_ACQ_REL,
> +					       __ATOMIC_ACQUIRE))
> +	      ;
> +	  }
> +	else
> +	  {
> +	    /* Okay, someone else already created 'async', free/destroy
> +	       everything allocated and return existing queue directly.  */
> +	    if (!dev->openacc.async.destruct_func (new_aq))
> +	      gomp_fatal ("error in async queue creation");
> +	    free (nl);
> +	  }
> +	return aqdesc->asyncqueue[async];
> +      }
>  
> -      if (!dev->openacc.async.asyncqueue[async])
> -	{
> -	  gomp_mutex_unlock (&dev->openacc.async.lock);
> -	  gomp_fatal ("async %d creation failed", async);
> -	}
> -      
> -      /* Link new async queue into active list.  */
> -      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
> -      n->aq = dev->openacc.async.asyncqueue[async];
> -      n->next = dev->openacc.async.active;
> -      dev->openacc.async.active = n;
> -    }
> -  gomp_mutex_unlock (&dev->openacc.async.lock);
> -  return dev->openacc.async.asyncqueue[async];
> +    /* Copy current asyncqueue contents*/
> +    memcpy (ndesc->asyncqueue, aqdesc->asyncqueue,
> +	    sizeof (struct goacc_asyncqueue *) * curr_num_aqs);
> +    ndesc->active->next = aqdesc->active;
> +
> +  } while (!__atomic_compare_exchange (&dev->openacc.async.aqdesc,
> +				       &aqdesc, &ndesc, false,
> +				       __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
> +
> +  /* Free the old stale descriptor and resources in aqdesc.  */
> +  free (aqdesc->asyncqueue);
> +  free (aqdesc);
> +
> +  return dev->openacc.async.aqdesc->asyncqueue[async];
>  }
>  
>  attribute_hidden struct goacc_asyncqueue *
> @@ -139,14 +178,12 @@
>    struct goacc_thread *thr = get_goacc_thread ();
>  
>    int ret = 1;
> -  gomp_mutex_lock (&thr->dev->openacc.async.lock);
> -  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
> +  for (goacc_aq_list l = thr->dev->openacc.async.aqdesc->active; l; l = l->next)
>      if (!thr->dev->openacc.async.test_func (l->aq))
>        {
>  	ret = 0;
>  	break;
>        }
> -  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
>    return ret;
>  }
>  
> @@ -194,10 +231,8 @@
>  {
>    struct gomp_device_descr *dev = get_goacc_thread_device ();
>  
> -  gomp_mutex_lock (&dev->openacc.async.lock);
> -  for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
> +  for (goacc_aq_list l = dev->openacc.async.aqdesc->active; l; l = l->next)
>      dev->openacc.async.synchronize_func (l->aq);
> -  gomp_mutex_unlock (&dev->openacc.async.lock);
>  }
>  
>  /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.  */
> @@ -221,14 +256,12 @@
>  
>    goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
>  
> -  gomp_mutex_lock (&thr->dev->openacc.async.lock);
> -  for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
> +  for (goacc_aq_list l = thr->dev->openacc.async.aqdesc->active; l; l = l->next)
>      {
>        thr->dev->openacc.async.synchronize_func (l->aq);
>        if (waiting_queue)
>  	thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
>      }
> -  gomp_mutex_unlock (&thr->dev->openacc.async.lock);
>  }
>  
>  int
> @@ -261,30 +294,27 @@
>  attribute_hidden void
>  goacc_init_asyncqueues (struct gomp_device_descr *devicep)
>  {
> -  gomp_mutex_init (&devicep->openacc.async.lock);
> -  devicep->openacc.async.nasyncqueue = 0;
> -  devicep->openacc.async.asyncqueue = NULL;
> -  devicep->openacc.async.active = NULL;
> +  devicep->openacc.async.aqdesc =
> +    gomp_malloc_cleared (sizeof (struct goacc_asyncqueue_desc));
>  }
>  
>  attribute_hidden bool
>  goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
>  {
> +  struct goacc_asyncqueue_desc *aqdesc = devicep->openacc.async.aqdesc;
>    bool ret = true;
> -  if (devicep->openacc.async.nasyncqueue > 0)
> +
> +  if (aqdesc->nasyncqueue > 0)
>      {
>        goacc_aq_list next;
> -      for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
> +      for (goacc_aq_list l = aqdesc->active; l; l = next)
>  	{
>  	  ret &= devicep->openacc.async.destruct_func (l->aq);
>  	  next = l->next;
>  	  free (l);
>  	}
> -      free (devicep->openacc.async.asyncqueue);
> -      devicep->openacc.async.nasyncqueue = 0;
> -      devicep->openacc.async.asyncqueue = NULL;
> -      devicep->openacc.async.active = NULL;
> +      free (aqdesc->asyncqueue);
>      }
> -  gomp_mutex_destroy (&devicep->openacc.async.lock);
> +  free (aqdesc);
>    return ret;
>  }
> diff -ru trunk-orig/libgomp/oacc-cuda.c trunk-work/libgomp/oacc-cuda.c
> --- trunk-orig/libgomp/oacc-cuda.c	2018-12-18 18:09:08.601913507 +0800
> +++ trunk-work/libgomp/oacc-cuda.c	2018-12-21 23:34:10.024738834 +0800
> @@ -87,9 +87,7 @@
>    if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
>      {
>        goacc_aq aq = get_goacc_asyncqueue (async);
> -      gomp_mutex_lock (&thr->dev->openacc.async.lock);
>        ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
> -      gomp_mutex_unlock (&thr->dev->openacc.async.lock);
>      }
>  
>    return ret;



More information about the Gcc-patches mailing list