This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Default compute dimensions (runtime)


Jakub,
this is the runtime side of default compute dimension support.

1) extend the -fopenacc-dim=X:Y:Z syntax to allow '-' indicating a runtime choice. (0 also indicates that, but I thought best to have an explicit syntax as well).

2) New plugin helper 'GOMP_PLUGIN_acc_default_dims' that parses a GOMP_OPENACC_DIM environment variable. The syntax here is the same as that for the -fopenacc-dim option -- except '-' isn't permitted. I have future-proofed the interface by including a plugin tag parameter. This will permit device_type support.

3) the plugin itself lazily calls GOMP_PLUGIN_acc_default_dims when it sees an unspecified dimension. Validates the default dimensions and then plugs them into the launch parameters.

The testcase reuses the compile-time testcase by breaking its core to a header file and explicitly setting the environment variable before first launch. The original testcase also explitily sets the environment variable, to make sure it's not being considered.

There doesn't seem to be a mechanism warning messages -- only debug ones or fatal errors. I'm not sure what the best approach to handling errors in the env var parsing, and ducked to silently ignore problems (and the plugin will then provide fallback values).

ok?

nathan
2016-02-03  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* doc/invoke.texi (fopenacc-dim): Document runtime support.
	* omp-low.c(oacc_parse_default_dims): Add runtime support.

	libgomp/
	* libgomp.map (GOMP_PLUGIN_acc_default_dims): New.
	* oacc-parallel.c (GOACC_parallel_keyed): Zero initialize dims.
	* oacc-plugin.c (GOMP_PLUGIN_acc_default_dims): New.
	* oacc-plugin.h (GOMP_PLUGIN_acc_default_dims): Declare.
	* plugin/plugin-nvptx.c (nvptx_exec): Add support for runtime
	defaul dimensions.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Breakout
	body to and #include ...
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h: ... this.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c: New.

Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 233084)
+++ gcc/doc/invoke.texi	(working copy)
@@ -1969,7 +1969,12 @@ have support for @option{-pthread}.
 Specify default compute dimensions for parallel offload regions that do
 not explicitly specify.  The @var{geom} value is a triple of
 ':'-separated sizes, in order 'gang', 'worker' and, 'vector'.  A size
-can be omitted, to use a target-specific default value.
+can be omitted, to use a target-specific default value. Use '-' to defer
+the size determination until execution.  In that case, the environment
+variable @var{GOMP_OPENACC_DIM} should be set.  It has the same format
+as the option value, except that '-' is not permitted.  If it is unset,
+a target-specific value is chosen. Runtime and compile-time values can
+be freely mixed.
 
 @item -fopenmp
 @opindex fopenmp
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 233084)
+++ gcc/omp-low.c	(working copy)
@@ -20275,9 +20275,14 @@ oacc_parse_default_dims (const char *dim
 	      pos++;
 	    }
 
-	  if (*pos != ':')
+	  long val = -1;
+	  if (*pos == '-')
+	    {
+	      pos++;
+	      val = 0;
+	    }
+	  else if (*pos != ':')
 	    {
-	      long val;
 	      const char *eptr;
 
 	      errno = 0;
@@ -20285,8 +20290,8 @@ oacc_parse_default_dims (const char *dim
 	      if (errno || val <= 0 || (int) val != val)
 		goto malformed;
 	      pos = eptr;
-	      oacc_default_dims[ix] = (int) val;
 	    }
+	  oacc_default_dims[ix] = (int) val;
 	}
       if (*pos)
 	{
Index: libgomp/libgomp.map
===================================================================
--- libgomp/libgomp.map	(revision 233084)
+++ libgomp/libgomp.map	(working copy)
@@ -411,4 +411,5 @@ GOMP_PLUGIN_1.0 {
 GOMP_PLUGIN_1.1 {
   global:
 	GOMP_PLUGIN_target_task_completion;
+	GOMP_PLUGIN_acc_default_dims;
 } GOMP_PLUGIN_1.0;
Index: libgomp/oacc-parallel.c
===================================================================
--- libgomp/oacc-parallel.c	(revision 233084)
+++ libgomp/oacc-parallel.c	(working copy)
@@ -103,6 +103,7 @@ GOACC_parallel_keyed (int device, void (
       return;
     }
 
+  memset (dims, 0, sizeof (dims));
   va_start (ap, kinds);
   /* TODO: This will need amending when device_type is implemented.  */
   while ((tag = va_arg (ap, unsigned)) != 0)
Index: libgomp/oacc-plugin.c
===================================================================
--- libgomp/oacc-plugin.c	(revision 233084)
+++ libgomp/oacc-plugin.c	(working copy)
@@ -29,6 +29,9 @@
 #include "libgomp.h"
 #include "oacc-plugin.h"
 #include "oacc-int.h"
+#include "gomp-constants.h"
+#include <stdlib.h>
+#include <errno.h>
 
 void
 GOMP_PLUGIN_async_unmap_vars (void *ptr)
@@ -46,3 +49,41 @@ GOMP_PLUGIN_acc_thread (void)
   struct goacc_thread *thr = goacc_thread ();
   return thr ? thr->target_tls : NULL;
 }
+
+/* Determine runtime default compute dimensions from environment. DIMS
+   must be zero-initialized.  Plugin will do remaining default & range
+   validation itself.  This should be called lazily on first required
+   use.
+
+   The tag will be used for device_type support.  */
+
+void
+GOMP_PLUGIN_acc_default_dims (const char *tag __attribute__((unused)),
+			      int *dims)
+{
+  const char *env_var = getenv ("GOMP_OPENACC_DIM");
+
+  if (env_var)
+    {
+      const char *pos = env_var;
+      int i;
+
+      GOMP_PLUGIN_debug (0, "Using GOMP_OPENACC_DIM=%s\n", env_var);
+      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+	{
+	  if (i && *pos++ != ':')
+	    break;
+	  if (*pos != ':')
+	    {
+	      const char *eptr;
+
+	      errno = 0;
+	      long val = strtol (pos, (char **)&eptr, 10);
+	      if (errno || val < 0 || (int) val != val)
+		break;
+	      dims[i] = (int) val;
+	      pos = eptr;
+	    }
+	}
+    }
+}
Index: libgomp/oacc-plugin.h
===================================================================
--- libgomp/oacc-plugin.h	(revision 233084)
+++ libgomp/oacc-plugin.h	(working copy)
@@ -30,4 +30,6 @@
 extern void GOMP_PLUGIN_async_unmap_vars (void *);
 extern void *GOMP_PLUGIN_acc_thread (void);
 
+extern void GOMP_PLUGIN_acc_default_dims (const char *tag, int *dims);
+
 #endif
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c	(revision 233084)
+++ libgomp/plugin/plugin-nvptx.c	(working copy)
@@ -894,9 +894,44 @@ nvptx_exec (void (*fn), size_t mapnum, v
   /* Initialize the launch dimensions.  Typically this is constant,
      provided by the device compiler, but we must permit runtime
      values.  */
-  for (i = 0; i != 3; i++)
-    if (targ_fn->launch->dim[i])
-      dims[i] = targ_fn->launch->dim[i];
+  int seen_zero = 0;
+  for (i = 0; i != GOMP_DIM_MAX; i++)
+    {
+      if (targ_fn->launch->dim[i])
+	dims[i] = targ_fn->launch->dim[i];
+      if (!dims[i])
+	seen_zero = 1;
+    }
+
+  if (seen_zero)
+    {
+      /* See if the user provided GOMP_OPENACC_DIM environment
+	 variable to specify runtime defaults. */
+      static int default_dims[GOMP_DIM_MAX];
+
+      if (!default_dims[0])
+	{
+	  GOMP_PLUGIN_acc_default_dims ("nvidia", default_dims);
+
+	  /* Do some sanity checking.  The CUDA API doesn't appear to
+	     provide queries to determine these limits.  */
+	  if (default_dims[GOMP_DIM_GANG] < 1)
+	    default_dims[GOMP_DIM_GANG] = 32;
+	  if (default_dims[GOMP_DIM_WORKER] < 1
+	      || default_dims[GOMP_DIM_WORKER] > 32)
+	    default_dims[GOMP_DIM_WORKER] = 32;
+	  default_dims[GOMP_DIM_VECTOR] = 32;
+
+	  GOMP_PLUGIN_debug (0, "Default dimensions [%d,%d,%d]\n",
+			     default_dims[GOMP_DIM_GANG],
+			     default_dims[GOMP_DIM_WORKER],
+			     default_dims[GOMP_DIM_VECTOR]);
+	}
+
+      for (i = 0; i != GOMP_DIM_MAX; i++)
+	if (!dims[i])
+	  dims[i] = default_dims[i];
+    }
 
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
@@ -918,7 +953,8 @@ nvptx_exec (void (*fn), size_t mapnum, v
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
 		     __FUNCTION__, targ_fn->launch->fn,
-		     dims[0], dims[1], dims[2]);
+		     dims[GOMP_DIM_GANG], dims[GOMP_DIM_WORKER],
+		     dims[GOMP_DIM_VECTOR]);
 
   // OpenACC		CUDA
   //
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c	(working copy)
@@ -0,0 +1,12 @@
+
+/* { dg-additional-options "-O2 -fopenacc-dim=-:-:32" } */
+
+#include "loop-dim-default.h"
+#include <stdlib.h>
+
+int main ()
+{
+  setenv ("GOMP_OPENACC_DIM", "8:16", 1);
+
+  return test_1 (8, 16, 32);
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c	(revision 233084)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c	(working copy)
@@ -1,133 +1,13 @@
 
-/* { dg-additional-options "-O2 -fopenacc-dim=16:16" } */
+/* { dg-additional-options "-O2 -fopenacc-dim=16:16:32" } */
 
-#include <openacc.h>
-#include <alloca.h>
-#include <string.h>
-#include <stdio.h>
-
-#pragma acc routine
-static int __attribute__ ((noinline)) coord ()
-{
-  int res = 0;
-
-  if (acc_on_device (acc_device_nvidia))
-    {
-      int g = 0, w = 0, v = 0;
-
-      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
-      res = (1 << 24) | (g << 16) | (w << 8) | v;
-    }
-  return res;
-}
-
-
-int check (const int *ary, int size, int gp, int wp, int vp)
-{
-  int exit = 0;
-  int ix;
-  int *gangs = (int *)alloca (gp * sizeof (int));
-  int *workers = (int *)alloca (wp * sizeof (int));
-  int *vectors = (int *)alloca (vp * sizeof (int));
-  int offloaded = 0;
-  
-  memset (gangs, 0, gp * sizeof (int));
-  memset (workers, 0, wp * sizeof (int));
-  memset (vectors, 0, vp * sizeof (int));
-  
-  for (ix = 0; ix < size; ix++)
-    {
-      int g = (ary[ix] >> 16) & 0xff;
-      int w = (ary[ix] >> 8) & 0xff;
-      int v = (ary[ix] >> 0) & 0xff;
-
-      if (g >= gp || w >= wp || v >= vp)
-	{
-	  printf ("unexpected cpu %#x used\n", ary[ix]);
-	  exit = 1;
-	}
-      else
-	{
-	  vectors[v]++;
-	  workers[w]++;
-	  gangs[g]++;
-	}
-      offloaded += ary[ix] >> 24;
-    }
-
-  if (!offloaded)
-    return 0;
-
-  if (offloaded != size)
-    {
-      printf ("offloaded %d times,  expected %d\n", offloaded, size);
-      return 1;
-    }
-
-  for (ix = 0; ix < gp; ix++)
-    if (gangs[ix] != gangs[0])
-      {
-	printf ("gang %d not used %d times\n", ix, gangs[0]);
-	exit = 1;
-      }
-  
-  for (ix = 0; ix < wp; ix++)
-    if (workers[ix] != workers[0])
-      {
-	printf ("worker %d not used %d times\n", ix, workers[0]);
-	exit = 1;
-      }
-  
-  for (ix = 0; ix < vp; ix++)
-    if (vectors[ix] != vectors[0])
-      {
-	printf ("vector %d not used %d times\n", ix, vectors[0]);
-	exit = 1;
-      }
-  
-  return exit;
-}
-
-#define N (32 *32*32)
-
-int test_1 (int gp, int wp, int vp)
-{
-  int ary[N];
-  int exit = 0;
-  
-#pragma acc parallel copyout (ary)
-  {
-#pragma acc loop gang (static:1)
-    for (int ix = 0; ix < N; ix++)
-      ary[ix] = coord ();
-  }
-
-  exit |= check (ary, N, gp, 1, 1);
-
-#pragma  acc parallel copyout (ary)
-  {
-#pragma acc loop worker
-    for (int ix = 0; ix < N; ix++)
-      ary[ix] = coord ();
-  }
-
-  exit |= check (ary, N, 1, wp, 1);
-
-#pragma  acc parallel copyout (ary)
-  {
-#pragma acc loop vector
-    for (int ix = 0; ix < N; ix++)
-      ary[ix] = coord ();
-  }
-
-  exit |= check (ary, N, 1, 1, vp);
-
-  return exit;
-}
+#include "loop-dim-default.h"
+#include <stdlib.h>
 
 int main ()
 {
+  /* Environment should be (silently) ignored.  */
+  setenv ("GOMP_OPENACC_DIM", "8:8:8",  1);
+
   return test_1 (16, 16, 32);
 }
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h	(working copy)
@@ -0,0 +1,125 @@
+#include <openacc.h>
+#include <alloca.h>
+#include <string.h>
+#include <stdio.h>
+
+#pragma acc routine
+static int __attribute__ ((noinline)) coord ()
+{
+  int res = 0;
+
+  if (acc_on_device (acc_device_nvidia))
+    {
+      int g = 0, w = 0, v = 0;
+
+      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+      res = (1 << 24) | (g << 16) | (w << 8) | v;
+    }
+  return res;
+}
+
+
+int check (const int *ary, int size, int gp, int wp, int vp)
+{
+  int exit = 0;
+  int ix;
+  int *gangs = (int *)alloca (gp * sizeof (int));
+  int *workers = (int *)alloca (wp * sizeof (int));
+  int *vectors = (int *)alloca (vp * sizeof (int));
+  int offloaded = 0;
+  
+  memset (gangs, 0, gp * sizeof (int));
+  memset (workers, 0, wp * sizeof (int));
+  memset (vectors, 0, vp * sizeof (int));
+  
+  for (ix = 0; ix < size; ix++)
+    {
+      int g = (ary[ix] >> 16) & 0xff;
+      int w = (ary[ix] >> 8) & 0xff;
+      int v = (ary[ix] >> 0) & 0xff;
+
+      if (g >= gp || w >= wp || v >= vp)
+	{
+	  printf ("unexpected cpu %#x used\n", ary[ix]);
+	  exit = 1;
+	}
+      else
+	{
+	  vectors[v]++;
+	  workers[w]++;
+	  gangs[g]++;
+	}
+      offloaded += ary[ix] >> 24;
+    }
+
+  if (!offloaded)
+    return 0;
+
+  if (offloaded != size)
+    {
+      printf ("offloaded %d times,  expected %d\n", offloaded, size);
+      return 1;
+    }
+
+  for (ix = 0; ix < gp; ix++)
+    if (gangs[ix] != gangs[0])
+      {
+	printf ("gang %d not used %d times\n", ix, gangs[0]);
+	exit = 1;
+      }
+  
+  for (ix = 0; ix < wp; ix++)
+    if (workers[ix] != workers[0])
+      {
+	printf ("worker %d not used %d times\n", ix, workers[0]);
+	exit = 1;
+      }
+  
+  for (ix = 0; ix < vp; ix++)
+    if (vectors[ix] != vectors[0])
+      {
+	printf ("vector %d not used %d times\n", ix, vectors[0]);
+	exit = 1;
+      }
+  
+  return exit;
+}
+
+#define N (32 *32*32)
+
+int test_1 (int gp, int wp, int vp)
+{
+  int ary[N];
+  int exit = 0;
+  
+#pragma acc parallel copyout (ary)
+  {
+#pragma acc loop gang (static:1)
+    for (int ix = 0; ix < N; ix++)
+      ary[ix] = coord ();
+  }
+
+  exit |= check (ary, N, gp, 1, 1);
+
+#pragma  acc parallel copyout (ary)
+  {
+#pragma acc loop worker
+    for (int ix = 0; ix < N; ix++)
+      ary[ix] = coord ();
+  }
+
+  exit |= check (ary, N, 1, wp, 1);
+
+#pragma  acc parallel copyout (ary)
+  {
+#pragma acc loop vector
+    for (int ix = 0; ix < N; ix++)
+      ary[ix] = coord ();
+  }
+
+  exit |= check (ary, N, 1, 1, vp);
+
+  return exit;
+}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]