This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[og7] Adjust k80 resources
- From: Cesar Philippidis <cesar at codesourcery dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Fri, 11 Aug 2017 12:38:14 -0700
- Subject: [og7] Adjust k80 resources
- Authentication-results: sourceware.org; auth=none
I've pushed this patch to openacc-gcc-7-branch to teach the libgomp
nvptx plugin how to cope with the hardware resources on K80 boards. K80
boards have two physical GPUs on a single board. Consequently, the CUDA
driver reports that 2x the amount of registers and shared memory are
available on those GPUs. But that's not true if only a single GPU is
being utilized. Consequently, this prevented the runtime from informing
the user that the K80 does not have sufficient hardware resources to
execute certain offloaded kernels.
Unfortunately, I don't have a test case which reproduces this failure,
but it does show up in various OpenACC tests such as cloverleaf. I'll
try to create a reduced test case that uses a lot of hardware registers
later.
Cesar
2017-08-11 Cesar Philippidis <cesar@codesourcery.com>
libgomp/
* plugin/cuda/cuda.h (CUdevice_attribute): Add
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR.
* plugin/plugin-nvptx.c (struct ptx_device): Add
compute_capability_major, compute_capability_minor members.
(nvptx_open_device): Probe driver for those values. Adjust
regs_per_sm and max_shared_memory_per_multiprocessor for K80
hardware.
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 25d5d1913b0..94a693cbdef 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -69,6 +69,8 @@ typedef enum {
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
} CUdevice_attribute;
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 37e1f6efbe1..10f000ab3c1 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -285,7 +285,9 @@ struct ptx_device
bool map;
bool concur;
bool mkern;
- int mode;
+ int mode;
+ int compute_capability_major;
+ int compute_capability_minor;
int clock_khz;
int num_sms;
int regs_per_block;
@@ -448,6 +450,14 @@ nvptx_open_device (int n)
ptx_dev->mode = pi;
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+ ptx_dev->compute_capability_major = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
+ &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+ ptx_dev->compute_capability_minor = pi;
+
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
ptx_dev->mkern = pi;
@@ -512,20 +522,37 @@ nvptx_open_device (int n)
GOMP_PLUGIN_debug (0, "Nvidia device %d:\n\tGPU_OVERLAP = %d\n"
"\tCAN_MAP_HOST_MEMORY = %d\n\tCONCURRENT_KERNELS = %d\n"
- "\tCOMPUTE_MODE = %d\n\tINTEGRATED = %d\n"
+ "\tCOMPUTE_MODE = %d\n"
+ "\tCU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = %d\n"
+ "\tCU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = %d\n"
+ "\tINTEGRATED = %d\n"
"\tMAX_THREADS_PER_BLOCK = %d\n\tWARP_SIZE = %d\n"
"\tMULTIPROCESSOR_COUNT = %d\n"
"\tMAX_THREADS_PER_MULTIPROCESSOR = %d\n"
"\tMAX_REGISTERS_PER_MULTIPROCESSOR = %d\n"
"\tMAX_SHARED_MEMORY_PER_MULTIPROCESSOR = %d\n",
ptx_dev->ord, ptx_dev->overlap, ptx_dev->map,
- ptx_dev->concur, ptx_dev->mode, ptx_dev->mkern,
- ptx_dev->max_threads_per_block, ptx_dev->warp_size,
- ptx_dev->num_sms,
+ ptx_dev->concur, ptx_dev->mode,
+ ptx_dev->compute_capability_major,
+ ptx_dev->compute_capability_minor,
+ ptx_dev->mkern, ptx_dev->max_threads_per_block,
+ ptx_dev->warp_size, ptx_dev->num_sms,
ptx_dev->max_threads_per_multiprocessor,
ptx_dev->regs_per_sm,
ptx_dev->max_shared_memory_per_multiprocessor);
+ /* K80 (SM_37) boards contain two physical GPUs. Consequently they
+ report 2x larger values for MAX_REGISTERS_PER_MULTIPROCESSOR and
+ MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. Those values need to be
+ adjusted in order to allow the nvptx_exec to select an
+ appropriate num_workers. */
+ if (ptx_dev->compute_capability_major == 3
+ && ptx_dev->compute_capability_minor == 7)
+ {
+ ptx_dev->regs_per_sm /= 2;
+ ptx_dev->max_shared_memory_per_multiprocessor /= 2;
+ }
+
return ptx_dev;
}