[gcc/devel/omp/gcc-13] [og13] OpenMP: Support accelerated 2D/3D memory copies for AMD GCN

Julian Brown <jules@gcc.gnu.org>
Wed Sep 20 11:22:38 GMT 2023


https://gcc.gnu.org/g:8d3e8e7801140d9388a7a239fc29a636c4c1d673

commit 8d3e8e7801140d9388a7a239fc29a636c4c1d673
Author: Julian Brown <julian@codesourcery.com>
Date:   Wed Sep 13 13:31:48 2023 +0000

    [og13] OpenMP: Support accelerated 2D/3D memory copies for AMD GCN
    
    This patch adds support for 2D/3D memory copies for omp_target_memcpy_rect
    and "target update", using AMD extensions to the HSA API.  It follows
    Tobias's similar patch for NVPTX (already present on mainline, and higher
    up in this og13 patch series).  A usage sketch follows the ChangeLog
    below.
    
    Support is also added for 1-dimensional strided accesses: these are
    treated as a special case of 2-dimensional transfers in which the
    whole-array row length (pitch) is the stride in bytes and a single
    element is copied per row, as illustrated in the sketch below.
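    
    As an editor's illustration (the identifiers are those used in the
    target.c hunk below; the sizes are made up): a strided transfer of
    1024 floats, taking every fourth element, becomes a 2D copy of 1024
    rows of 4 bytes each from arrays whose row pitch is 16 bytes:
    
        /* Copy every fourth float (element_size = 4, stride = 16 bytes).
           Offsets are zero because the subarray pointers already address
           the first element to be transferred.  */
        devp->memcpy2d_func (dst_id, src_id,
                             4,    /* dim1_size: one element per row  */
                             1024, /* dim0_len: number of rows  */
                             subarray_dst, 0, 0, 16, /* dst pitch = stride  */
                             subarray_src, 0, 0, 16  /* src pitch = stride  */);
    
    Both the element size and the stride are multiples of four here, so the
    copy also satisfies the GCN plugin's 4-byte alignment checks.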
    
    2023-09-19  Julian Brown  <julian@codesourcery.com>
    
    libgomp/
            * plugin/plugin-gcn.c (hsa_runtime_fn_info): Add hsa_amd_memory_lock_fn,
            hsa_amd_memory_unlock_fn, hsa_amd_memory_async_copy_rect_fn function
            pointers.
            (init_hsa_runtime_functions): Add above functions, with DLSYM_OPT_FN.
            (GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New functions.
            * target.c (omp_target_memcpy_rect_worker): Add 1D strided transfer
            support.
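
Editor's usage sketch (not part of the commit; the buffer names and sizes
are invented): a host-to-device rectangular copy of the following shape can
now take the accelerated path when the default device is a GCN card.

  #include <omp.h>

  double host[32][128];
  int dev = omp_get_default_device ();
  /* Device buffer with the same 32x128 layout as the host array.  */
  void *devbuf = omp_target_alloc (sizeof host, dev);

  size_t volume[]  = {16, 64};   /* rows to copy, elements per row  */
  size_t dst_off[] = {8, 32};    /* destination block starts at [8][32]  */
  size_t src_off[] = {8, 32};    /* source block starts at [8][32]  */
  size_t dims[]    = {32, 128};  /* whole-array dimensions, in elements  */

  omp_target_memcpy_rect (devbuf, host, sizeof (double), 2, volume,
                          dst_off, src_off, dims, dims,
                          dev, omp_get_initial_device ());

libgomp lowers this to GOMP_OFFLOAD_memcpy2d with dim1_size = 64 * 8 bytes,
dim0_len = 16 and pitches of 128 * 8 bytes, which satisfies the plugin's
4-byte alignment checks.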

Diff:
---
 libgomp/ChangeLog.omp       |  10 ++
 libgomp/plugin/plugin-gcn.c | 359 ++++++++++++++++++++++++++++++++++++++++++++
 libgomp/target.c            |  31 ++++
 3 files changed, 400 insertions(+)

diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index 6b18e255fc7..56ad7c7c67a 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,13 @@
+2023-09-19  Julian Brown  <julian@codesourcery.com>
+
+	* plugin/plugin-gcn.c (hsa_runtime_fn_info): Add hsa_amd_memory_lock_fn,
+	hsa_amd_memory_unlock_fn, hsa_amd_memory_async_copy_rect_fn function
+	pointers.
+	(init_hsa_runtime_functions): Add above functions, with DLSYM_OPT_FN.
+	(GOMP_OFFLOAD_memcpy2d, GOMP_OFFLOAD_memcpy3d): New functions.
+	* target.c (omp_target_memcpy_rect_worker): Add 1D strided transfer
+	support.
+
 2023-09-19  Julian Brown  <julian@codesourcery.com>
 
 	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_memcpy2d): Adjust parameters to
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 7c0115ae579..cb016a2bfba 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -211,6 +211,16 @@ struct hsa_runtime_fn_info
   hsa_status_t (*hsa_amd_svm_attributes_set_fn)
     (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
      size_t attribute_count);
+  hsa_status_t (*hsa_amd_memory_lock_fn)
+    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
+     void **agent_ptr);
+  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
+  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
+    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
+     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
+     const hsa_dim3_t *range, hsa_agent_t copy_agent,
+     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
+     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
 };
 
 /* Structure describing the run-time and grid properties of an HSA kernel
@@ -1439,6 +1449,9 @@ init_hsa_runtime_functions (void)
   DLSYM_FN (hsa_queue_destroy)
   DLSYM_FN (hsa_code_object_deserialize)
   DLSYM_OPT_FN (hsa_amd_svm_attributes_set)
+  DLSYM_OPT_FN (hsa_amd_memory_lock)
+  DLSYM_OPT_FN (hsa_amd_memory_unlock)
+  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
   return true;
 #undef DLSYM_FN
 }
@@ -3950,6 +3963,352 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
   return true;
 }
 
+/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
+   measured in bytes.  So we have:
+
+   dim1_size: number of bytes to copy on innermost dimension ("row")
+   dim0_len: number of rows to copy
+   dst: base pointer for destination of copy
+   dst_offset1_size: innermost row offset (for dest), in bytes
+   dst_offset0_len: offset, number of rows (for dest)
+   dst_dim1_size: whole-array dest row length, in bytes (pitch)
+   src: base pointer for source of copy
+   src_offset1_size: innermost row offset (for source), in bytes
+   src_offset0_len: offset, number of rows (for source)
+   src_dim1_size: whole-array source row length, in bytes (pitch)
+*/
+
+int
+GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
+		       size_t dim0_len, void *dst, size_t dst_offset1_size,
+		       size_t dst_offset0_len, size_t dst_dim1_size,
+		       const void *src, size_t src_offset1_size,
+		       size_t src_offset0_len, size_t src_dim1_size)
+{
+  if (!hsa_fns.hsa_amd_memory_lock_fn
+      || !hsa_fns.hsa_amd_memory_unlock_fn
+      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
+    return -1;
+
+  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
+     out quietly if we have anything oddly-aligned rather than letting the
+     driver raise an error.  */
+  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
+    return -1;
+
+  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
+    return -1;
+
+  /* Only handle host to device or device to host transfers here.  */
+  if ((dst_ord == -1 && src_ord == -1)
+      || (dst_ord != -1 && src_ord != -1))
+    return -1;
+
+  hsa_amd_copy_direction_t dir
+    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
+  hsa_agent_t copy_agent;
+
+  /* We need to pin (lock) host memory before we start the transfer.  Try to
+     lock the minimum size necessary, i.e. using partial first/last rows of the
+     whole array.  Something like this:
+
+	 rows -->
+	 ..............
+     c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
+     o | ++#######+++++ <- whole row
+     l | ++#######+++++ <-     "
+     s v ++#######..... <- last row apart from trailing remainder
+	 ..............
+
+     We could split very large transfers into several rectangular copies, but
+     that is unimplemented for now.  */
+
+  size_t bounded_size_host, first_elem_offset_host;
+  void *host_ptr;
+  if (dir == hsaHostToDevice)
+    {
+      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
+      first_elem_offset_host = src_offset0_len * src_dim1_size
+			       + src_offset1_size;
+      host_ptr = (void *) src;
+      struct agent_info *agent = get_agent_info (dst_ord);
+      copy_agent = agent->id;
+    }
+  else
+    {
+      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
+      first_elem_offset_host = dst_offset0_len * dst_dim1_size
+			       + dst_offset1_size;
+      host_ptr = dst;
+      struct agent_info *agent = get_agent_info (src_ord);
+      copy_agent = agent->id;
+    }
+
+  void *agent_ptr;
+
+  hsa_status_t status
+    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
+				      bounded_size_host, NULL, 0, &agent_ptr);
+  /* We can't lock the host memory: don't give up though, we might still be
+     able to use the slow path in our caller.  So, don't make this an
+     error.  */
+  if (status != HSA_STATUS_SUCCESS)
+    return -1;
+
+  hsa_pitched_ptr_t dstpp, srcpp;
+  hsa_dim3_t dst_offsets, src_offsets, ranges;
+
+  int retval = 1;
+
+  hsa_signal_t completion_signal;
+  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      retval = -1;
+      goto unlock;
+    }
+
+  if (dir == hsaHostToDevice)
+    {
+      srcpp.base = agent_ptr - first_elem_offset_host;
+      dstpp.base = dst;
+    }
+  else
+    {
+      srcpp.base = (void *) src;
+      dstpp.base = agent_ptr - first_elem_offset_host;
+    }
+
+  srcpp.pitch = src_dim1_size;
+  srcpp.slice = 0;
+
+  src_offsets.x = src_offset1_size;
+  src_offsets.y = src_offset0_len;
+  src_offsets.z = 0;
+
+  dstpp.pitch = dst_dim1_size;
+  dstpp.slice = 0;
+
+  dst_offsets.x = dst_offset1_size;
+  dst_offsets.y = dst_offset0_len;
+  dst_offsets.z = 0;
+
+  ranges.x = dim1_size;
+  ranges.y = dim0_len;
+  ranges.z = 1;
+
+  status
+    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
+						 &src_offsets, &ranges,
+						 copy_agent, dir, 0, NULL,
+						 completion_signal);
+  /* If the rectangular copy fails, we might still be able to use the slow
+     path.  We need to unlock the host memory though, so don't return
+     immediately.  */
+  if (status != HSA_STATUS_SUCCESS)
+    retval = -1;
+  else
+    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
+					HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
+					HSA_WAIT_STATE_ACTIVE);
+
+  hsa_fns.hsa_signal_destroy_fn (completion_signal);
+
+unlock:
+  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not unlock host memory", status);
+
+  return retval;
+}
+
+/* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
+   measured in bytes.  So we have:
+
+   dim2_size: number of bytes to copy on innermost dimension ("row")
+   dim1_len: number of rows per slice to copy
+   dim0_len: number of slices to copy
+   dst: base pointer for destination of copy
+   dst_offset2_size: innermost row offset (for dest), in bytes
+   dst_offset1_len: offset, number of rows (for dest)
+   dst_offset0_len: offset, number of slices (for dest)
+   dst_dim2_size: whole-array dest row length, in bytes (pitch)
+   dst_dim1_len: whole-array number of rows in slice (for dest)
+   src: base pointer for source of copy
+   src_offset2_size: innermost row offset (for source), in bytes
+   src_offset1_len: offset, number of rows (for source)
+   src_offset0_len: offset, number of slices (for source)
+   src_dim2_size: whole-array source row length, in bytes (pitch)
+   src_dim1_len: whole-array number of rows in slice (for source)
+*/
+
+int
+GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
+		       size_t dim1_len, size_t dim0_len, void *dst,
+		       size_t dst_offset2_size, size_t dst_offset1_len,
+		       size_t dst_offset0_len, size_t dst_dim2_size,
+		       size_t dst_dim1_len, const void *src,
+		       size_t src_offset2_size, size_t src_offset1_len,
+		       size_t src_offset0_len, size_t src_dim2_size,
+		       size_t src_dim1_len)
+{
+  if (!hsa_fns.hsa_amd_memory_lock_fn
+      || !hsa_fns.hsa_amd_memory_unlock_fn
+      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
+    return -1;
+
+  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
+     out quietly if we have anything oddly-aligned rather than letting the
+     driver raise an error.  */
+  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
+    return -1;
+
+  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
+    return -1;
+
+  /* Only handle host to device or device to host transfers here.  */
+  if ((dst_ord == -1 && src_ord == -1)
+      || (dst_ord != -1 && src_ord != -1))
+    return -1;
+
+  hsa_amd_copy_direction_t dir
+    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
+  hsa_agent_t copy_agent;
+
+  /* We need to pin (lock) host memory before we start the transfer.  Try to
+     lock the minimum size necessary, i.e. using partial first/last slices of
+     the whole 3D array.  Something like this:
+
+	 slice 0:          slice 1:          slice 2:
+	     __________        __________        __________
+       ^    /+++++++++/    :  /+++++++++/    :  /         /
+    column /+++##++++/|    | /+++##++++/|    | /+++##    /  # = subarray
+     /	  /   ##++++/ |    |/+++##++++/ |    |/+++##++++/   + = area to pin
+	 /_________/  :    /_________/  :    /_________/
+	row --->
+
+     We could split very large transfers into several rectangular copies, but
+     that is unimplemented for now.  */
+
+  size_t bounded_size_host, first_elem_offset_host;
+  void *host_ptr;
+  if (dir == hsaHostToDevice)
+    {
+      size_t slice_bytes = src_dim2_size * src_dim1_len;
+      bounded_size_host = slice_bytes * (dim0_len - 1)
+			  + src_dim2_size * (dim1_len - 1)
+			  + dim2_size;
+      first_elem_offset_host = src_offset0_len * slice_bytes
+			       + src_offset1_len * src_dim2_size
+			       + src_offset2_size;
+      host_ptr = (void *) src;
+      struct agent_info *agent = get_agent_info (dst_ord);
+      copy_agent = agent->id;
+    }
+  else
+    {
+      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
+      bounded_size_host = slice_bytes * (dim0_len - 1)
+			  + dst_dim2_size * (dim1_len - 1)
+			  + dim2_size;
+      first_elem_offset_host = dst_offset0_len * slice_bytes
+			       + dst_offset1_len * dst_dim2_size
+			       + dst_offset2_size;
+      host_ptr = dst;
+      struct agent_info *agent = get_agent_info (src_ord);
+      copy_agent = agent->id;
+    }
+
+  void *agent_ptr;
+
+  hsa_status_t status
+    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
+				      bounded_size_host, NULL, 0, &agent_ptr);
+  /* We can't lock the host memory: don't give up though, we might still be
+     able to use the slow path in our caller (maybe even with iterated memcpy2d
+     calls).  So, don't make this an error.  */
+  if (status != HSA_STATUS_SUCCESS)
+    return -1;
+
+  hsa_pitched_ptr_t dstpp, srcpp;
+  hsa_dim3_t dst_offsets, src_offsets, ranges;
+
+  int retval = 1;
+
+  hsa_signal_t completion_signal;
+  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      retval = -1;
+      goto unlock;
+    }
+
+  if (dir == hsaHostToDevice)
+    {
+      srcpp.base = agent_ptr - first_elem_offset_host;
+      dstpp.base = dst;
+    }
+  else
+    {
+      srcpp.base = (void *) src;
+      dstpp.base = agent_ptr - first_elem_offset_host;
+    }
+
+  /* Pitch is measured in bytes.  */
+  srcpp.pitch = src_dim2_size;
+  /* Slice is also measured in bytes (i.e. total per-slice).  */
+  srcpp.slice = src_dim2_size * src_dim1_len;
+
+  src_offsets.x = src_offset2_size;
+  src_offsets.y = src_offset1_len;
+  src_offsets.z = src_offset0_len;
+
+  /* As above.  */
+  dstpp.pitch = dst_dim2_size;
+  dstpp.slice = dst_dim2_size * dst_dim1_len;
+
+  dst_offsets.x = dst_offset2_size;
+  dst_offsets.y = dst_offset1_len;
+  dst_offsets.z = dst_offset0_len;
+
+  ranges.x = dim2_size;
+  ranges.y = dim1_len;
+  ranges.z = dim0_len;
+
+  status
+    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
+						 &src_offsets, &ranges,
+						 copy_agent, dir, 0, NULL,
+						 completion_signal);
+  /* If the rectangular copy fails, we might still be able to use the slow
+     path.  We need to unlock the host memory though, so don't return
+     immediately.  */
+  if (status != HSA_STATUS_SUCCESS)
+    retval = -1;
+  else
+    {
+      hsa_signal_value_t sv
+	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
+					      HSA_SIGNAL_CONDITION_LT, 1,
+					      UINT64_MAX,
+					      HSA_WAIT_STATE_ACTIVE);
+      if (sv < 0)
+	{
+	  GCN_WARNING ("async copy rect failure");
+	  retval = -1;
+	}
+    }
+
+  hsa_fns.hsa_signal_destroy_fn (completion_signal);
+
+unlock:
+  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not unlock host memory", status);
+
+  return retval;
+}
+
 /* }}}  */
 /* {{{ OpenMP Plugin API  */
 
diff --git a/libgomp/target.c b/libgomp/target.c
index 6644b3399d3..e4c04e68995 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -6022,6 +6022,37 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
       if (__builtin_mul_overflow (span, strides[0], &stride))
 	return EINVAL;
 
+      if (((src_devicep && src_devicep->memcpy2d_func)
+	   || (dst_devicep && dst_devicep->memcpy2d_func))
+	  && (stride % element_size) == 0)
+	{
+	  /* Try using memcpy2d for a 1-dimensional strided access.  Here we
+	     treat the transfer as a 2-dimensional array, where the inner
+	     dimension is calculated to be (stride in bytes) / element_size.
+	     Indices/offsets are adjusted so the source/destination pointers
+	     point to the first element to be transferred, to make the sums
+	     easier.  (There are some configurations of 2D strided accesses
+	     that memcpy3d could handle similarly, but those are probably rare
+	     and are unimplemented for now.)   */
+
+	  /* If stride is element size, this is a contiguous transfer and
+	     should have been handled above.  */
+	  assert (stride > element_size);
+
+	  int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+	  int src_id = src_devicep ? src_devicep->target_id : -1;
+	  void *subarray_src = (char *) src + src_off;
+	  void *subarray_dst = (char *) dst + dst_off;
+
+	  struct gomp_device_descr *devp = dst_devicep ? dst_devicep
+						       : src_devicep;
+	  ret = devp->memcpy2d_func (dst_id, src_id, element_size, volume[0],
+				     subarray_dst, 0, 0, stride, subarray_src,
+				     0, 0, stride);
+	  if (ret != -1)
+	    return ret ? 0 : EINVAL;
+	}
+
       for (i = 0, ret = 1; i < volume[0] && ret; i++)
 	{
 	  if (src_devicep == NULL)
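
As a closing editor's note, here is a worked instance of the pinning bound
computed in GOMP_OFFLOAD_memcpy2d above (the numbers are invented):

  /* Host-to-device: 3 rows of 8 bytes from a source with a 32-byte pitch.  */
  size_t dim0_len = 3, dim1_size = 8, src_dim1_size = 32;
  size_t bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
  /* 32 * 2 + 8 = 72 bytes are locked, covering the first byte copied
     through the last, rather than the 3 * 32 = 96 bytes spanned by three
     whole rows.  */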

