}
}
+/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
+ host-to-device memory transfers. */
+
+struct gomp_coalesce_buf
+{
+ /* Buffer into which gomp_copy_host2dev will memcpy data and from which
+ it will be copied to the device. */
+ void *buf;
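+ /* Target memory descriptor the chunk offsets are relative to; its
+ tgt_start is used to turn device addresses back into offsets. */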
+ struct target_mem_desc *tgt;
+ /* Array of offsets: chunks[2 * i] is the starting offset and
+ chunks[2 * i + 1] the ending offset, relative to the tgt->tgt_start
+ device address, of the chunks to be copied to buf and later to the
+ device. */
+ size_t *chunks;
+ /* Number of chunks in the chunks array, or -1 if coalesce buffering should not
+ be performed. */
+ long chunk_cnt;
+ /* During construction of the chunks array, the number of memory regions
+ within the last chunk. If a chunk holds just one memory region, we copy
+ it directly to the device rather than going through buf. */
+ long use_cnt;
+};
+
+/* Maximum size of a memory region considered for coalescing. Larger copies
+ are performed directly to the device. */
+#define MAX_COALESCE_BUF_SIZE (32 * 1024)
+
+/* Maximum size of a gap between regions for them still to be copied
+ within the same chunk. All the device offsets considered lie within
+ newly allocated device memory, so it isn't fatal if we copy some padding
+ in between from host to device. The gaps come either from alignment
+ padding or from memory regions which are not supposed to be copied from
+ host to device (e.g. map(alloc:), map(from:) etc.). */
+#define MAX_COALESCE_BUF_GAP (4 * 1024)
+
+/* Add a region with device tgt_start relative offset START and length LEN
+ to CBUF. */
+
+static inline void
+gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t len)
+{
+ if (len > MAX_COALESCE_BUF_SIZE || len == 0)
+ return;
+ if (cbuf->chunk_cnt)
+ {
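+ /* A chunk_cnt of -1 means coalescing has already been given up on. */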
+ if (cbuf->chunk_cnt < 0)
+ return;
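+ /* Regions are expected to arrive in ascending device offset order;
+ if one starts before the end of the last chunk, give up on
+ coalescing altogether. */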
+ if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+ {
+ cbuf->chunk_cnt = -1;
+ return;
+ }
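+ /* The gap after the last chunk is small enough; extend the chunk to
+ cover this region as well. */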
+ if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP)
+ {
+ cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
+ cbuf->use_cnt++;
+ return;
+ }
+ /* If the last chunk is only used by one mapping, discard it,
+ as it will be a single host-to-device copy anyway and
+ memcpying it around would only waste cycles. */
+ if (cbuf->use_cnt == 1)
+ cbuf->chunk_cnt--;
+ }
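+ /* Start a new chunk covering just this region. */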
+ cbuf->chunks[2 * cbuf->chunk_cnt] = start;
+ cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
+ cbuf->chunk_cnt++;
+ cbuf->use_cnt = 1;
+}
+
+/* Return true for mapping kinds which need to copy data from the
+ host to the device for regions that weren't previously mapped. */
+
+static inline bool
+gomp_to_device_kind_p (int kind)
+{
+ switch (kind)
+ {
+ case GOMP_MAP_ALLOC:
+ case GOMP_MAP_FROM:
+ case GOMP_MAP_FORCE_ALLOC:
+ case GOMP_MAP_ALWAYS_FROM:
+ return false;
+ default:
+ return true;
+ }
+}
+
static void
gomp_copy_host2dev (struct gomp_device_descr *devicep,
- void *d, const void *h, size_t sz)
+ void *d, const void *h, size_t sz,
+ struct gomp_coalesce_buf *cbuf)
{
+ if (cbuf)
+ {
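+ /* If the destination lies within one of the coalesced chunks, stage
+ the data into the host-side buffer instead; it reaches the device
+ in one larger transfer when the buffer is flushed. */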
+ uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
+ if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+ {
+ long first = 0;
+ long last = cbuf->chunk_cnt - 1;
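+ /* Binary search for the chunk containing doff. */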
+ while (first <= last)
+ {
+ long middle = (first + last) >> 1;
+ if (cbuf->chunks[2 * middle + 1] <= doff)
+ first = middle + 1;
+ else if (cbuf->chunks[2 * middle] <= doff)
+ {
+ if (doff + sz > cbuf->chunks[2 * middle + 1])
+ gomp_fatal ("internal libgomp cbuf error");
+ memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
+ h, sz);
+ return;
+ }
+ else
+ last = middle - 1;
+ }
+ }
+ }
gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
}
static inline void
gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
splay_tree_key newn, struct target_var_desc *tgt_var,
- unsigned char kind)
+ unsigned char kind, struct gomp_coalesce_buf *cbuf)
{
tgt_var->key = oldn;
tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
(void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+ newn->host_start - oldn->host_start),
(void *) newn->host_start,
- newn->host_end - newn->host_start);
+ newn->host_end - newn->host_start, cbuf);
if (oldn->refcount != REFCOUNT_INFINITY)
oldn->refcount++;
static void
gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
- uintptr_t target_offset, uintptr_t bias)
+ uintptr_t target_offset, uintptr_t bias,
+ struct gomp_coalesce_buf *cbuf)
{
struct gomp_device_descr *devicep = tgt->device_descr;
struct splay_tree_s *mem_map = &devicep->mem_map;
if (cur_node.host_start == (uintptr_t) NULL)
{
cur_node.tgt_offset = (uintptr_t) NULL;
- /* FIXME: see comment about coalescing host/dev transfers below. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start + target_offset),
(void *) &cur_node.tgt_offset,
- sizeof (void *));
+ sizeof (void *), cbuf);
return;
}
/* Add bias to the pointer value. */
array section. Now subtract bias to get what we want
to initialize the pointer with. */
cur_node.tgt_offset -= bias;
- /* FIXME: see comment about coalescing host/dev transfers below. */
gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
- (void *) &cur_node.tgt_offset, sizeof (void *));
+ (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
}
static void
gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
size_t first, size_t i, void **hostaddrs,
- size_t *sizes, void *kinds)
+ size_t *sizes, void *kinds,
+ struct gomp_coalesce_buf *cbuf)
{
struct gomp_device_descr *devicep = tgt->device_descr;
struct splay_tree_s *mem_map = &devicep->mem_map;
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
gomp_map_vars_existing (devicep, n2, &cur_node,
- &tgt->list[i], kind & typemask);
+ &tgt->list[i], kind & typemask, cbuf);
return;
}
if (sizes[i] == 0)
== n2->tgt_offset - n->tgt_offset)
{
gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
- kind & typemask);
+ kind & typemask, cbuf);
return;
}
}
&& n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
{
gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
- kind & typemask);
+ kind & typemask, cbuf);
return;
}
}
tgt->list_count = mapnum;
tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
tgt->device_descr = devicep;
+ struct gomp_coalesce_buf cbuf, *cbufp = NULL;
if (mapnum == 0)
{
tgt_align = sizeof (void *);
tgt_size = 0;
+ cbuf.chunks = NULL;
+ cbuf.chunk_cnt = -1;
+ cbuf.use_cnt = 0;
+ cbuf.buf = NULL;
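+ /* Coalescing can only pay off with more than one mapping, or when a
+ target argument pointer block is copied as well. */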
+ if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
+ {
+ cbuf.chunks
+ = (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
+ cbuf.chunk_cnt = 0;
+ }
if (pragma_kind == GOMP_MAP_VARS_TARGET)
{
size_t align = 4 * sizeof (void *);
tgt_align = align;
tgt_size = mapnum * sizeof (void *);
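+ /* The pointer block copied to the device at the end forms the first
+ chunk. */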
+ cbuf.chunk_cnt = 1;
+ cbuf.use_cnt = 1 + (mapnum > 1);
+ cbuf.chunks[0] = 0;
+ cbuf.chunks[1] = tgt_size;
}
gomp_mutex_lock (&devicep->lock);
size_t align = (size_t) 1 << (kind >> rshift);
if (tgt_align < align)
tgt_align = align;
- tgt_size -= (uintptr_t) hostaddrs[first]
- - (uintptr_t) hostaddrs[i];
+ tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
- tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
+ tgt_size += cur_node.host_end - cur_node.host_start;
not_found_cnt += last - i;
for (i = first; i <= last; i++)
- tgt->list[i].key = NULL;
+ {
+ tgt->list[i].key = NULL;
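+ /* Only regions which will really be copied to the device are
+ candidates for coalescing. */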
+ if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
+ & typemask))
+ gomp_coalesce_buf_add (&cbuf,
+ tgt_size - cur_node.host_end
+ + (uintptr_t) hostaddrs[i],
+ sizes[i]);
+ }
i--;
continue;
}
for (i = first; i <= last; i++)
gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
- sizes, kinds);
+ sizes, kinds, NULL);
i--;
continue;
}
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
+ gomp_coalesce_buf_add (&cbuf, tgt_size,
+ cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
has_firstprivate = true;
continue;
n = splay_tree_lookup (mem_map, &cur_node);
if (n && n->refcount != REFCOUNT_LINK)
gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
- kind & typemask);
+ kind & typemask, NULL);
else
{
tgt->list[i].key = NULL;
if (tgt_align < align)
tgt_align = align;
tgt_size = (tgt_size + align - 1) & ~(align - 1);
+ if (gomp_to_device_kind_p (kind & typemask))
+ gomp_coalesce_buf_add (&cbuf, tgt_size,
+ cur_node.host_end - cur_node.host_start);
tgt_size += cur_node.host_end - cur_node.host_start;
if ((kind & typemask) == GOMP_MAP_TO_PSET)
{
tgt->tgt_start = (uintptr_t) tgt->to_free;
tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
tgt->tgt_end = tgt->tgt_start + tgt_size;
+
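+ /* A last chunk used by a single mapping would be one host-to-device
+ copy anyway; drop it instead of memcpying the data around. */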
+ if (cbuf.use_cnt == 1)
+ cbuf.chunk_cnt--;
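+ /* Try to allocate the host-side staging buffer covering all chunks;
+ if malloc fails, fall back to uncoalesced copies. */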
+ if (cbuf.chunk_cnt > 0)
+ {
+ cbuf.buf
+ = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
+ if (cbuf.buf)
+ {
+ cbuf.tgt = tgt;
+ cbufp = &cbuf;
+ }
+ }
}
else
{
len = sizes[i];
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start + tgt_size),
- (void *) hostaddrs[i], len);
+ (void *) hostaddrs[i], len, cbufp);
tgt_size += len;
continue;
case GOMP_MAP_FIRSTPRIVATE_INT:
}
for (i = first; i <= last; i++)
gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
- sizes, kinds);
+ sizes, kinds, cbufp);
i--;
continue;
case GOMP_MAP_ALWAYS_POINTER:
+ cur_node.host_start
- n->host_start),
(void *) &cur_node.tgt_offset,
- sizeof (void *));
+ sizeof (void *), cbufp);
cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
+ cur_node.host_start - n->host_start;
continue;
splay_tree_key n = splay_tree_lookup (mem_map, k);
if (n && n->refcount != REFCOUNT_LINK)
gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
- kind & typemask);
+ kind & typemask, cbufp);
else
{
k->link_key = NULL;
case GOMP_MAP_FORCE_TOFROM:
case GOMP_MAP_ALWAYS_TO:
case GOMP_MAP_ALWAYS_TOFROM:
- /* FIXME: Perhaps add some smarts, like if copying
- several adjacent fields from host to target, use some
- host buffer to avoid sending each var individually. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- k->host_end - k->host_start);
+ k->host_end - k->host_start, cbufp);
break;
case GOMP_MAP_POINTER:
gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
- k->tgt_offset, sizes[i]);
+ k->tgt_offset, sizes[i], cbufp);
break;
case GOMP_MAP_TO_PSET:
- /* FIXME: see above FIXME comment. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- k->host_end - k->host_start);
+ k->host_end - k->host_start, cbufp);
for (j = i + 1; j < mapnum; j++)
if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
k->tgt_offset
+ ((uintptr_t) hostaddrs[j]
- k->host_start),
- sizes[j]);
+ sizes[j], cbufp);
i++;
}
break;
(void *) (tgt->tgt_start
+ k->tgt_offset),
(void *) k->host_start,
- sizeof (void *));
+ sizeof (void *), cbufp);
break;
default:
gomp_mutex_unlock (&devicep->lock);
for (i = 0; i < mapnum; i++)
{
cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
- /* FIXME: see above FIXME comment. */
gomp_copy_host2dev (devicep,
(void *) (tgt->tgt_start + i * sizeof (void *)),
- (void *) &cur_node.tgt_offset, sizeof (void *));
+ (void *) &cur_node.tgt_offset, sizeof (void *),
+ cbufp);
}
}
+ if (cbufp)
+ {
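+ /* Flush the staging buffer with one host-to-device transfer per
+ chunk. */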
+ long c;
+ for (c = 0; c < cbuf.chunk_cnt; ++c)
+ gomp_copy_host2dev (devicep,
+ (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
+ (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
+ cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
+ free (cbuf.buf);
+ }
+
/* If the variable from "omp target enter data" map-list was already mapped,
tgt is not needed. Otherwise tgt will be freed by gomp_unmap_vars or
gomp_exit_data. */
size_t size = cur_node.host_end - cur_node.host_start;
if (GOMP_MAP_COPY_TO_P (kind & typemask))
- gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
+ gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
if (GOMP_MAP_COPY_FROM_P (kind & typemask))
gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
}