This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[gomp-nvptx 13/13] libgomp plugin: handle multiple teams
- From: Alexander Monakov <amonakov at ispras dot ru>
- To: gcc-patches at gcc dot gnu dot org
- Date: Wed, 20 Jan 2016 20:27:07 +0300
- Subject: [gomp-nvptx 13/13] libgomp plugin: handle multiple teams
- Authentication-results: sourceware.org; auth=none
- References: <1453310827-23183-1-git-send-email-amonakov at ispras dot ru>
This patch implements the libgomp plugin side of multiple-teams support, complementing the compiler-side changes earlier in this series.
* plugin/plugin-nvptx.c (struct targ_fn_descriptor): Add new fields.
(struct ptx_device): Ditto. Set them...
(nvptx_open_device): ...here.
(GOMP_OFFLOAD_load_image): Set new targ_fn_descriptor fields.
(nvptx_adjust_launch_bounds): New. Use it...
(GOMP_OFFLOAD_run): ...here.
---
libgomp/ChangeLog.gomp-nvptx | 9 ++++
libgomp/plugin/plugin-nvptx.c | 106 +++++++++++++++++++++++++++++++++++++++---
2 files changed, 109 insertions(+), 6 deletions(-)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 87e0494..b7bf59b 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -254,6 +254,8 @@ struct targ_fn_descriptor
{
CUfunction fn;
const struct targ_fn_launch *launch;
+ int regs_per_thread;
+ int max_threads_per_block;
};
/* A loaded PTX image. */
@@ -290,6 +292,9 @@ struct ptx_device
bool mkern;
int mode;
int clock_khz;
+ int num_sms;
+ int regs_per_block;
+ int regs_per_sm;
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
@@ -648,6 +653,36 @@ nvptx_open_device (int n)
ptx_dev->clock_khz = pi;
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->num_sms = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
+ dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->regs_per_block = pi;
+
+ /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
+ in CUDA 6.0 and newer. */
+ r = cuDeviceGetAttribute (&pi, 82, dev);
+ /* Fallback: use limit of registers per block, which is usually equal. */
+ if (r == CUDA_ERROR_INVALID_VALUE)
+ pi = ptx_dev->regs_per_block;
+ else if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+
+ ptx_dev->regs_per_sm = pi;
+
+ r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
+ if (pi != 32)
+ GOMP_PLUGIN_fatal ("Only warp size 32 is supported");
+
r = cuDeviceGetAttribute (&async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
if (r != CUDA_SUCCESS)
@@ -1589,13 +1624,23 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
{
CUfunction function;
+ int nregs, mthrs;
r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
+ r = cuFuncGetAttribute (&nregs, CU_FUNC_ATTRIBUTE_NUM_REGS, function);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuFuncGetAttribute error: %s", cuda_error (r));
+ r = cuFuncGetAttribute (&mthrs, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+ function);
+ if (r != CUDA_SUCCESS)
+ GOMP_PLUGIN_fatal ("cuFuncGetAttribute error: %s", cuda_error (r));
targ_fns->fn = function;
targ_fns->launch = &fn_descs[i];
+ targ_fns->regs_per_thread = nregs;
+ targ_fns->max_threads_per_block = mthrs;
targ_tbl->start = (uintptr_t) targ_fns;
targ_tbl->end = targ_tbl->start + 1;
@@ -1822,19 +1867,67 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
return nvptx_set_cuda_stream (async, stream);
}
+/* Adjust launch dimensions: pick good values for number of blocks and warps
+ and ensure that number of warps does not exceed CUDA limits as well as GCC's
+ own limits. */
+
+static void
+nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
+ struct ptx_device *ptx_dev,
+ long *teams_p, long *threads_p)
+{
+ int max_warps_block = fn->max_threads_per_block / 32;
+ /* Maximum 32 warps per block is an implementation limit in NVPTX backend
+ and libgcc, which matches documented limit of all GPUs as of 2015. */
+ if (max_warps_block > 32)
+ max_warps_block = 32;
+ if (*threads_p <= 0)
+ *threads_p = 8;
+ if (*threads_p > max_warps_block)
+ *threads_p = max_warps_block;
+
+ int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
+ /* This is an estimate of how many blocks the device can host simultaneously.
+ Actual limit, which may be lower, can be queried with "occupancy control"
+ driver interface (since CUDA 6.0). */
+ int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
+ if (*teams_p <= 0 || *teams_p > max_blocks)
+ *teams_p = max_blocks;
+}
+
void
-GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars)
+GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
CUresult r;
struct ptx_device *ptx_dev = ptx_devices[ord];
const char *maybe_abort_msg = "(perhaps abort was called)";
- void *args = &tgt_vars;
+ void *fn_args = &tgt_vars;
+ long teams = 0, threads = 0;
+
+ if (!args)
+ GOMP_PLUGIN_fatal ("No target arguments provided");
+ while (*args)
+ {
+ long id = (long) *args++, val;
+ if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+ val = (long) *args++;
+ else
+ val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+ if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
+ continue;
+ id &= GOMP_TARGET_ARG_ID_MASK;
+ if (id == GOMP_TARGET_ARG_NUM_TEAMS)
+ teams = val;
+ else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
+ threads = val;
+ }
+ nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
r = cuLaunchKernel (function,
- 1, 1, 1,
- 32, 8, 1,
- 0, ptx_dev->null_stream->stream, &args, 0);
+ teams, 1, 1,
+ 32, threads, 1,
+ 0, ptx_dev->null_stream->stream, &fn_args, 0);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
@@ -1847,7 +1940,8 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars)
}
/* Asynchronous counterpart of GOMP_OFFLOAD_run in the libgomp plugin
   interface.  Not implemented for NVPTX: any call aborts through
   GOMP_PLUGIN_fatal.  The parameter list mirrors GOMP_OFFLOAD_run, with
   ASYNC_DATA added for the completion-callback context; none of the
   parameters are examined here.  */

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}