[PATCH, OpenMP 5.0] Implement structure element mapping changes in 5.0

Jakub Jelinek jakub@redhat.com
Fri Oct 30 14:05:40 GMT 2020


On Mon, Oct 26, 2020 at 09:10:08AM +0100, Jakub Jelinek via Gcc-patches wrote:
> Yes, it is a QoI and it is important not to regress about that.
> Furthermore, the more we diverge from what the spec says, it will be harder
> for us to implement, not just now, but in the future too.
> What I wrote about the actual implementation is actually not accurate, we
> need the master and slaves to be the struct splay_tree_key_s objects.
> And that one already has the aux field that could be used for the slaves,
> so we could e.g. use another magic value of refcount, e.g. REFCOUNT_SLAVE
> ~(uintptr_t) 2, and in that case aux would point to the master
> splay_tree_key_s.
> 
> And the 
> "If the corresponding list item’s reference count was not already incremented because of the
> effect of a map clause on the construct then:
> a) The corresponding list item’s reference count is incremented by one;"
> and
> "If the map-type is not delete and the corresponding list item’s reference count is finite and
> was not already decremented because of the effect of a map clause on the construct then:
> a) The corresponding list item’s reference count is decremented by one;"
> rules we need to implement in any case, I don't see a way around that.
> The same list item can now be mapped (or unmapped) multiple times on the same
> construct.

To show exactly what I meant, here is a proof-of-concept (but unfinished)
patch.
For OpenMP only (I believe OpenACC currently has neither the concept of
structure sibling lists nor the OpenMP 5.0 requirement that on one construct
a refcount isn't incremented or decremented multiple times) it reuses the
dynamic_refcount field, otherwise only used by OpenACC, for the structure
sibling lists; in particular, all but the first mapping in a structure
sibling list will have refcount == REFCOUNT_SIBLING and dynamic_refcount
pointing to their master's refcount field, and the master has
dynamic_refcount set to the number of REFCOUNT_SIBLING entries that follow
it.

In the patch I've only changed the construction of such splay_tree_keys and
changed gomp_exit_data to deal with them (that is the very easy part), plus
implemented the OpenMP 5.0 rule that one refcount isn't decremented more
than once.
What would need to be done is handle the rest, in particular (for OpenMP
only) adjust the refcount handling (splay_tree_key only, not
target_mem_desc) so that for the just-created splay_tree_keys (refcount
pointers between tgt->array and the end of that array; perhaps we should add
a field recording how many elements the array has) nothing is bumped - just
rely on the refcount = 1 we set elsewhere - and for the other refcounts, if
REFCOUNT_SIBLING, use the dynamic_refcount pointer, and if not
REFCOUNT_INFINITY, queue the refcount for a later increment instead of
bumping it immediately (again, with an alloca'd list).  And when unmapping
at the end of target or target data, do something similar to what
gomp_exit_data does in the patch (perhaps with some helper functions).
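To sketch what I mean by queueing the increments (hypothetical helper, not
in the patch; it relies on the gomp_uintptr_t_cmp comparator the patch adds
to target.c and mirrors the decrement loop gomp_exit_data gains below):

static void
gomp_increment_refcounts (uintptr_t **refcounts, size_t nrefcounts)
{
  size_t i;
  /* Sort the queued refcount pointers so duplicates become adjacent.  */
  if (nrefcounts > 1)
    qsort (refcounts, nrefcounts, sizeof (uintptr_t *), gomp_uintptr_t_cmp);
  /* Bump each distinct refcount exactly once, no matter how many map
     clauses on the construct referred to it.  */
  for (i = 0; i < nrefcounts; i++)
    if (i == 0 || refcounts[i - 1] != refcounts[i])
      (*refcounts[i])++;
}

The caller would fill refcounts with &k->refcount (or, for REFCOUNT_SIBLING
entries, the master's refcount that dynamic_refcount points to) for the
already present, non-REFCOUNT_INFINITY mappings, and skip the
splay_tree_keys created by the current construct, which already start with
refcount = 1.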

At least from omp-lang discussions, the intent is that e.g. on
struct S { int a, b, c, d, e; } s = { 1, 2, 3, 4, 5 };
#pragma omp target enter data map (s)
// same thing as
// #pragma omp target enter data map (s.a, s.b, s.c, s.d, s.e)
// The above at least theoretically creates 5 mappings, with refcount set
// to 1 for each (but with all those refcounts behaving in sync), but I'd
// strongly prefer to create just one mapping with one refcount.
int *p = &s.b;
int *q = &s.d;
#pragma omp target enter data map (p[:1]) map (q[:1])
// The above needs to bump either the refcounts of all of s.a, s.b, s.c,
// s.d and s.e by 1, or, when the whole struct has just a single refcount,
// bump that one refcount by 1 as well.

int a;
#pragma omp target enter data map (a)   // This creates just one mapping and sets refcount to 1
// as int is not an aggregate
char *r, *s;
r = (char *) &a;
s = r + 2;
#pragma omp target enter data map (r[:1], s[:1])
// The above should bump the refcount of a just once in OpenMP 5.0, not
// twice.

For both testcases, I guess one can construct user-observable tests from
this, where the refcount determines whether the data is copied back to the
host at certain points (or not); a sketch of one such test for the struct
case follows below.
And for the non-contiguous structure element mappings, the idea would be
that we still use a single refcount for the whole structure sibling list as
defined in the spec.
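Something like the following (untested sketch, assuming an actual
non-shared-memory offloading device) could serve as such a user-observable
test for the struct case: with the 5.0 rule the second exit data decrements
the struct's single refcount only once, so the device-side modification must
not be copied back until the final exit data.

#include <stdlib.h>

struct S { int a, b, c, d, e; };

int
main ()
{
  struct S s = { 1, 2, 3, 4, 5 };
  int *p = &s.b, *q = &s.d;
  #pragma omp target enter data map (to: s)
  #pragma omp target enter data map (to: p[:1]) map (to: q[:1])
  #pragma omp target
  s.b = 42;
  /* With the 5.0 rule the refcount drops from 2 to 1 only, not to 0,
     so nothing is copied back here.  */
  #pragma omp target exit data map (from: p[:1]) map (from: q[:1])
  if (s.b != 2)
    abort ();
  /* The refcount reaches 0 here, so the device value is copied back.  */
  #pragma omp target exit data map (from: s)
  if (s.b != 42)
    abort ();
  return 0;
}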

--- libgomp/libgomp.h.jj	2020-10-30 12:57:16.176284101 +0100
+++ libgomp/libgomp.h	2020-10-30 12:57:40.264014514 +0100
@@ -1002,6 +1002,10 @@ struct target_mem_desc {
 /* Special value for refcount - tgt_offset contains target address of the
    artificial pointer to "omp declare target link" object.  */
 #define REFCOUNT_LINK (~(uintptr_t) 1)
+/* Special value for refcount - structure sibling list item other than
+   the first one.  *(uintptr_t *)dynamic_refcount is the actual refcount
+   for it.  */
+#define REFCOUNT_SIBLING (~(uintptr_t) 2)
 
 /* Special offset values.  */
 #define OFFSET_INLINED (~(uintptr_t) 0)
--- libgomp/target.c.jj	2020-10-30 12:57:19.926242130 +0100
+++ libgomp/target.c	2020-10-30 14:45:04.016809943 +0100
@@ -1022,6 +1022,7 @@ gomp_map_vars_internal (struct gomp_devi
       splay_tree_node array = tgt->array;
       size_t j, field_tgt_offset = 0, field_tgt_clear = FIELD_TGT_EMPTY;
       uintptr_t field_tgt_base = 0;
+      splay_tree_key field_tgt_sibling = NULL;
 
       for (i = 0; i < mapnum; i++)
 	if (has_always_ptrset
@@ -1153,6 +1154,7 @@ gomp_map_vars_internal (struct gomp_devi
 		    field_tgt_base = (uintptr_t) hostaddrs[first];
 		    field_tgt_offset = tgt_size;
 		    field_tgt_clear = last;
+		    field_tgt_sibling = NULL;
 		    tgt_size += cur_node.host_end
 				- (uintptr_t) hostaddrs[first];
 		    continue;
@@ -1251,12 +1253,29 @@ gomp_map_vars_internal (struct gomp_devi
 		size_t align = (size_t) 1 << (kind >> rshift);
 		tgt->list[i].key = k;
 		k->tgt = tgt;
+		k->refcount = 1;
+		k->dynamic_refcount = 0;
 		if (field_tgt_clear != FIELD_TGT_EMPTY)
 		  {
 		    k->tgt_offset = k->host_start - field_tgt_base
 				    + field_tgt_offset;
+		    if (pragma_kind != GOMP_MAP_VARS_OPENACC)
+		      {
+			if (field_tgt_sibling == 0)
+			  field_tgt_sibling = k;
+			else
+			  {
+			    k->refcount = REFCOUNT_SIBLING;
+			    k->dynamic_refcount
+			      = (uintptr_t) &field_tgt_sibling->refcount;
+			    field_tgt_sibling->dynamic_refcount++;
+			  }
+		      }
 		    if (i == field_tgt_clear)
-		      field_tgt_clear = FIELD_TGT_EMPTY;
+		      {
+			field_tgt_clear = FIELD_TGT_EMPTY;
+			field_tgt_sibling = 0;
+		      }
 		  }
 		else
 		  {
@@ -1270,8 +1289,6 @@ gomp_map_vars_internal (struct gomp_devi
 		tgt->list[i].is_attach = false;
 		tgt->list[i].offset = 0;
 		tgt->list[i].length = k->host_end - k->host_start;
-		k->refcount = 1;
-		k->dynamic_refcount = 0;
 		tgt->refcount++;
 		array->left = NULL;
 		array->right = NULL;
@@ -2462,12 +2479,27 @@ GOMP_target_update_ext (int device, size
   gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true);
 }
 
+static int
+gomp_uintptr_t_cmp (const void *p1, const void *p2)
+{
+  if (*(const uintptr_t **) p1 < *(const uintptr_t **) p2)
+    return -1;
+  if (*(const uintptr_t **) p1 > *(const uintptr_t **) p2)
+    return 1;
+  return 0;
+}
+
 static void
 gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
 		void **hostaddrs, size_t *sizes, unsigned short *kinds)
 {
   const int typemask = 0xff;
   size_t i;
+  uintptr_t **refcounts = gomp_alloca (mapnum * sizeof (uintptr_t *));
+  size_t nrefcounts = 0;
+  splay_tree_key *keys = gomp_alloca (mapnum * sizeof (splay_tree_key));
+  bool any_deletes = false;
+  bool any_from = false;
   gomp_mutex_lock (&devicep->lock);
   if (devicep->state == GOMP_DEVICE_FINALIZED)
     {
@@ -2482,6 +2514,8 @@ gomp_exit_data (struct gomp_device_descr
       switch (kind)
 	{
 	case GOMP_MAP_FROM:
+	  any_from = true;
+	  /* FALLTHRU */
 	case GOMP_MAP_ALWAYS_FROM:
 	case GOMP_MAP_DELETE:
 	case GOMP_MAP_RELEASE:
@@ -2493,26 +2527,31 @@ gomp_exit_data (struct gomp_device_descr
 			      || kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION)
 	    ? gomp_map_0len_lookup (&devicep->mem_map, &cur_node)
 	    : splay_tree_lookup (&devicep->mem_map, &cur_node);
+	  keys[i] = k;
 	  if (!k)
 	    continue;
 
-	  if (k->refcount > 0 && k->refcount != REFCOUNT_INFINITY)
-	    k->refcount--;
-	  if ((kind == GOMP_MAP_DELETE
-	       || kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
-	      && k->refcount != REFCOUNT_INFINITY)
-	    k->refcount = 0;
+	  if (k->refcount != REFCOUNT_INFINITY)
+	    {
+	      uintptr_t *refcount = &k->refcount;
+	      if (k->refcount == REFCOUNT_SIBLING)
+		refcount = (uintptr_t *)k->dynamic_refcount;
+	      if (kind == GOMP_MAP_DELETE
+		  || kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
+		*refcount = 0;
+	      if (nrefcounts && refcounts[nrefcounts - 1] == refcount)
+		/* Already queued for refcount decrease.  */;
+	      else
+		/* Otherwise queue the mapping for refcount decrement.  */
+		refcounts[nrefcounts++] = refcount;
+	    }
 
-	  if ((kind == GOMP_MAP_FROM && k->refcount == 0)
-	      || kind == GOMP_MAP_ALWAYS_FROM)
+	  if (kind == GOMP_MAP_ALWAYS_FROM)
 	    gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
 				(void *) (k->tgt->tgt_start + k->tgt_offset
 					  + cur_node.host_start
 					  - k->host_start),
 				cur_node.host_end - cur_node.host_start);
-	  if (k->refcount == 0)
-	    gomp_remove_var (devicep, k);
-
 	  break;
 	default:
 	  gomp_mutex_unlock (&devicep->lock);
@@ -2521,6 +2560,82 @@ gomp_exit_data (struct gomp_device_descr
 	}
     }
 
+  /* Sort the refcount pointers.  */
+  if (nrefcounts > 1)
+    qsort (refcounts, nrefcounts, sizeof (uintptr_t *), gomp_uintptr_t_cmp);
+
+  /* So that we can decrease each separate refcount just once.  */
+  for (i = 0; i < nrefcounts; i++)
+    if (i == 0 || refcounts[i - 1] != refcounts[i])
+      {
+	uintptr_t val = *refcounts[i];
+	if (val != 0)
+	  *refcounts[i] = val - 1;
+	if (val <= 1)
+	  any_deletes = true;
+      }
+
+  /* If anything has been decremented to 0 and there are from map-kind
+     map clauses, copy the data to host.  For always, from we've already
+     done it earlier.  */
+  if (any_deletes && any_from)
+    for (i = 0; i < mapnum; i++)
+      {
+	struct splay_tree_key_s cur_node;
+	unsigned char kind = kinds[i] & typemask;
+	if (kind == GOMP_MAP_FROM)
+	  {
+	    cur_node.host_start = (uintptr_t) hostaddrs[i];
+	    cur_node.host_end = cur_node.host_start + sizes[i];
+	    splay_tree_key k = keys[i];
+	    if (k == NULL || k->refcount == REFCOUNT_INFINITY)
+	      continue;
+
+	    uintptr_t *refcount = &k->refcount;
+	    if (k->refcount == REFCOUNT_SIBLING)
+	      refcount = (uintptr_t *) k->dynamic_refcount;
+
+	    if (*refcount == 0)
+	      gomp_copy_dev2host (devicep, NULL, (void *) cur_node.host_start,
+				  (void *) (k->tgt->tgt_start + k->tgt_offset
+					    + cur_node.host_start
+					    - k->host_start),
+				  cur_node.host_end - cur_node.host_start);
+	  }
+      }
+
+  /* And finally remove any mappings that reached refcount 0.  */
+  if (any_deletes)
+    for (i = 0; i < nrefcounts; i++)
+      if ((i == 0 || refcounts[i - 1] != refcounts[i])
+	  && *refcounts[i] == 0)
+	{
+	  splay_tree_key k
+	    = (splay_tree_key) ((char *) refcounts[i]
+				- offsetof (struct splay_tree_key_s,
+					    refcount));
+	  if (k->dynamic_refcount)
+	    {
+	      /* For OpenMP structure sibling lists, remove all following
+		 REFCOUNT_SIBLING mappings before finally removing the first
+		 one.  */
+	      splay_tree_key k2;
+	      for (k2 = k + 1; k->dynamic_refcount;
+		   k2++, k->dynamic_refcount--)
+		{
+		  if (k2->refcount != REFCOUNT_SIBLING
+		      || k2->dynamic_refcount != (uintptr_t) &k->refcount)
+		    {
+		      gomp_mutex_unlock (&devicep->lock);
+		      gomp_fatal ("internal error in structure sibling "
+				  "list handling");
+		    }
+		  gomp_remove_var (devicep, k2);
+		}
+	    }
+	  gomp_remove_var (devicep, k);
+	}
+
   gomp_mutex_unlock (&devicep->lock);
 }
 


	Jakub


