[PATCH,nvptx] Use CUDA driver API to select default runtime launch geometry


The attached patch teaches libgomp how to use the CUDA thread occupancy
calculator built into the CUDA driver. Although both are based on the
CUDA thread occupancy spreadsheet distributed with CUDA, the built-in
occupancy calculator differs from the occupancy calculator in og8 in two
key ways. First, og8 launches twice the number of gangs that the driver
thread occupancy calculator suggests. This was my attempt at preventing
threads from idling; it operates on a principle similar to running 'make
-jN', where N is twice the number of CPU threads. Second, whereas og8
always attempts to maximize the CUDA block size, the driver may select a
smaller block, which effectively decreases num_workers.
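To make the contrast concrete, here is a minimal sketch of the two
derivations. This is illustrative only, not part of the patch: apart
from cuOccupancyMaxPotentialBlockSize every name here is hypothetical,
and the og8 heuristic is a simplified stand-in based on the description
above, not og8's actual code.

#include <cuda.h>

/* Hypothetical stand-in for the og8 heuristic: maximize the block
   size, then over-subscribe gangs by a factor of two, on the same
   principle as 'make -jN' with N = twice the number of CPUs.  */
static void
og8_style_dims (int num_sms, int max_threads_per_block, int vectors,
		int *gangs, int *workers)
{
  *workers = max_threads_per_block / vectors;	/* maximal block  */
  *gangs = 2 * num_sms;				/* over-subscribe */
}

/* Derive default dims from the driver's occupancy calculator, in the
   same shape as what the patch does in nvptx_exec.  */
static CUresult
driver_dims (CUfunction fn, int warp_size, int vectors,
	     int *gangs, int *workers)
{
  int grids, blocks;
  CUresult r = cuOccupancyMaxPotentialBlockSize (&grids, &blocks, fn,
						 NULL, 0, 0);
  if (r != CUDA_SUCCESS)
    return r;
  /* The driver may pick a block smaller than the hardware maximum,
     which effectively lowers num_workers.  */
  *gangs = grids * (blocks / warp_size);
  *workers = blocks / vectors;
  return CUDA_SUCCESS;
}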

In terms of performance, there isn't much of a difference between the
CUDA driver's occupancy calculator and og8's. On the tests that are
affected, the two are generally within a factor of two of one another,
with some tests running faster with the driver occupancy calculator and
others with og8's.

Unfortunately, support for the driver occupancy calculator isn't
universal; it's only available in CUDA 6.5 (driver version 6050) and
newer. This patch exploits the fact that init_cuda_lib only checks for
errors on the last library function it initializes, so loading
cuOccupancyMaxPotentialBlockSize with an older driver doesn't cause
initialization to fail. Instead, the use of

  cuOccupancyMaxPotentialBlockSize

is guarded by checking driver_version. If the driver occupancy
calculator isn't available, the plugin falls back to the existing
defaults. Maybe the og8 thread occupancy calculator would make a better
default for older versions of CUDA, but that's a patch for another day.
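For reference, here is a standalone sketch of the guard-and-fallback
pattern. Again illustrative rather than the patch itself:
pick_default_block_size is a hypothetical helper, and error handling
is reduced to the minimum.

#include <cuda.h>

/* Return the driver-suggested block size for FN, or FALLBACK if the
   runtime-loaded driver predates CUDA 6.5 (6050), where
   cuOccupancyMaxPotentialBlockSize first appeared.  */
static int
pick_default_block_size (CUfunction fn, int fallback)
{
  int version = 0;
  if (cuDriverGetVersion (&version) != CUDA_SUCCESS || version < 6050)
    return fallback;

  int min_grids, blocks;
  if (cuOccupancyMaxPotentialBlockSize (&min_grids, &blocks, fn,
					NULL, 0, 0) != CUDA_SUCCESS)
    return fallback;
  return blocks;
}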

Is this patch OK for trunk? I bootstrapped and regression-tested it
on x86_64 with nvptx offloading.

Thanks,
Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry

2018-XX-YY  Cesar Philippidis  <cesar@codesourcery.com>

	libgomp/
	* plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuDriverGetVersion): Declare.
	(cuOccupancyMaxPotentialBlockSize): Declare.
	* plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
	cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
	(ptx_device): Add driver_version member.
	(nvptx_open_device): Initialize it.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.
---
 libgomp/plugin/cuda/cuda.h    |  5 +++++
 libgomp/plugin/plugin-nvptx.c | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825..1fc694d 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
 CUresult cuDeviceGet (CUdevice *, int);
 CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
 CUresult cuDeviceGetCount (int *);
+CUresult cuDriverGetVersion (int *);
 CUresult cuEventCreate (CUevent *, unsigned);
 #define cuEventDestroy cuEventDestroy_v2
 CUresult cuEventDestroy (CUevent);
@@ -170,6 +172,9 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize (int *, int *, CUfunction,
+					    CUoccupancyB2DSize, size_t,
+					    int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index b6ec5f8..2647af6 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -63,6 +63,7 @@ CUDA_ONE_CALL (cuCtxSynchronize)	\
 CUDA_ONE_CALL (cuDeviceGet)		\
 CUDA_ONE_CALL (cuDeviceGetAttribute)	\
 CUDA_ONE_CALL (cuDeviceGetCount)	\
+CUDA_ONE_CALL (cuDriverGetVersion)	\
 CUDA_ONE_CALL (cuEventCreate)		\
 CUDA_ONE_CALL (cuEventDestroy)		\
 CUDA_ONE_CALL (cuEventElapsedTime)	\
@@ -94,6 +95,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)	\
 CUDA_ONE_CALL (cuModuleLoad)		\
 CUDA_ONE_CALL (cuModuleLoadData)	\
 CUDA_ONE_CALL (cuModuleUnload)		\
+CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize) \
 CUDA_ONE_CALL (cuStreamCreate)		\
 CUDA_ONE_CALL (cuStreamDestroy)		\
 CUDA_ONE_CALL (cuStreamQuery)		\
@@ -423,6 +425,7 @@ struct ptx_device
   int max_threads_per_block;
   int max_threads_per_multiprocessor;
   int default_dims[GOMP_DIM_MAX];
+  int driver_version;
 
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -734,6 +737,7 @@ nvptx_open_device (int n)
   ptx_dev->ord = n;
   ptx_dev->dev = dev;
   ptx_dev->ctx_shared = false;
+  ptx_dev->driver_version = 0;
 
   r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
   if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
@@ -827,6 +831,9 @@ nvptx_open_device (int n)
   for (int i = 0; i != GOMP_DIM_MAX; i++)
     ptx_dev->default_dims[i] = 0;
 
+  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
+  ptx_dev->driver_version = pi;
+
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);
 
@@ -1220,11 +1227,39 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
       {
 	bool default_dim_p[GOMP_DIM_MAX];
+	int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
+	int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
+	int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
+
+	/* The CUDA driver occupancy calculator is only available on
+	   CUDA version 6.5 (6050) and newer.  */
+	if (nvthd->ptx_dev->driver_version >= 6050)
+	  {
+	    int grids, blocks;
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
+	      gangs = grids * (blocks / warp_size);
+
+	    if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
+	      workers = blocks / vectors;
+	  }
+
 	for (i = 0; i != GOMP_DIM_MAX; i++)
 	  {
 	    default_dim_p[i] = !dims[i];
 	    if (default_dim_p[i])
-	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	      switch (i)
+		{
+		case GOMP_DIM_GANG: dims[i] = gangs; break;
+		case GOMP_DIM_WORKER: dims[i] = workers; break;
+		case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		default: GOMP_PLUGIN_fatal ("invalid dim");
+		}
 	  }
 
 	if (default_dim_p[GOMP_DIM_VECTOR])
-- 
2.7.4

