This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Default compute dimensions (runtime)

From: Nathan Sidwell <nathan at acm dot org>
To: Jakub Jelinek <jakub at redhat dot com>
Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>
Date: Wed, 3 Feb 2016 10:30:43 -0500
Subject: Default compute dimensions (runtime)
Authentication-results: sourceware.org; auth=none

Jakub,
this is the runtime side of default compute dimension support.

1) extend the -fopenacc-dim=X:Y:Z syntax to allow '-' indicating a runtimechoice. (0 also indicates that, but I thought best to have an explicit syntaxas well).

2) New plugin helper 'GOMP_PLUGIN_acc_default_dims' that parses aGOMP_OPENACC_DIM environment variable. The syntax here is the same as that forthe -fopenacc-dim option -- except '-' isn't permitted. I have future-proofedthe interface by including a plugin tag parameter. This will permitdevice_type support.

3) the plugin itself lazily calls GOMP_PLUGIN_acc_default_dims when it sees anunspecified dimension. Validates the default dimensions and then plugs theminto the launch parameters.

The testcase reuses the compile-time testcase by breaking its core to a headerfile and explicitly setting the environment variable before first launch. Theoriginal testcase also explitily sets the environment variable, to make sureit's not being considered.

There doesn't seem to be a mechanism warning messages -- only debug ones orfatal errors. I'm not sure what the best approach to handling errors in the envvar parsing, and ducked to silently ignore problems (and the plugin will thenprovide fallback values).


ok?

nathan

2016-02-03  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* doc/invoke.texi (fopenacc-dim): Document runtime support.
	* omp-low.c(oacc_parse_default_dims): Add runtime support.

	libgomp/
	* libgomp.map (GOMP_PLUGIN_acc_default_dims): New.
	* oacc-parallel.c (GOACC_parallel_keyed): Zero initialize dims.
	* oacc-plugin.c (GOMP_PLUGIN_acc_default_dims): New.
	* oacc-plugin.h (GOMP_PLUGIN_acc_default_dims): Declare.
	* plugin/plugin-nvptx.c (nvptx_exec): Add support for runtime
	defaul dimensions.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Breakout
	body to and #include ...
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h: ... this.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c: New.

Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	(revision 233084)
+++ gcc/doc/invoke.texi	(working copy)
@@ -1969,7 +1969,12 @@ have support for @option{-pthread}.
 Specify default compute dimensions for parallel offload regions that do
 not explicitly specify.  The @var{geom} value is a triple of
 ':'-separated sizes, in order 'gang', 'worker' and, 'vector'.  A size
-can be omitted, to use a target-specific default value.
+can be omitted, to use a target-specific default value. Use '-' to defer
+the size determination until execution.  In that case, the environment
+variable @var{GOMP_OPENACC_DIM} should be set.  It has the same format
+as the option value, except that '-' is not permitted.  If it is unset,
+a target-specific value is chosen. Runtime and compile-time values can
+be freely mixed.
 
 @item -fopenmp
 @opindex fopenmp
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 233084)
+++ gcc/omp-low.c	(working copy)
@@ -20275,9 +20275,14 @@ oacc_parse_default_dims (const char *dim
 	      pos++;
 	    }
 
-	  if (*pos != ':')
+	  long val = -1;
+	  if (*pos == '-')
+	    {
+	      pos++;
+	      val = 0;
+	    }
+	  else if (*pos != ':')
 	    {
-	      long val;
 	      const char *eptr;
 
 	      errno = 0;
@@ -20285,8 +20290,8 @@ oacc_parse_default_dims (const char *dim
 	      if (errno || val <= 0 || (int) val != val)
 		goto malformed;
 	      pos = eptr;
-	      oacc_default_dims[ix] = (int) val;
 	    }
+	  oacc_default_dims[ix] = (int) val;
 	}
       if (*pos)
 	{
Index: libgomp/libgomp.map
===================================================================
--- libgomp/libgomp.map	(revision 233084)
+++ libgomp/libgomp.map	(working copy)
@@ -411,4 +411,5 @@ GOMP_PLUGIN_1.0 {
 GOMP_PLUGIN_1.1 {
   global:
 	GOMP_PLUGIN_target_task_completion;
+	GOMP_PLUGIN_acc_default_dims;
 } GOMP_PLUGIN_1.0;
Index: libgomp/oacc-parallel.c
===================================================================
--- libgomp/oacc-parallel.c	(revision 233084)
+++ libgomp/oacc-parallel.c	(working copy)
@@ -103,6 +103,7 @@ GOACC_parallel_keyed (int device, void (
       return;
     }
 
+  memset (dims, 0, sizeof (dims));
   va_start (ap, kinds);
   /* TODO: This will need amending when device_type is implemented.  */
   while ((tag = va_arg (ap, unsigned)) != 0)
Index: libgomp/oacc-plugin.c
===================================================================
--- libgomp/oacc-plugin.c	(revision 233084)
+++ libgomp/oacc-plugin.c	(working copy)
@@ -29,6 +29,9 @@
 #include "libgomp.h"
 #include "oacc-plugin.h"
 #include "oacc-int.h"
+#include "gomp-constants.h"
+#include <stdlib.h>
+#include <errno.h>
 
 void
 GOMP_PLUGIN_async_unmap_vars (void *ptr)
@@ -46,3 +49,41 @@ GOMP_PLUGIN_acc_thread (void)
   struct goacc_thread *thr = goacc_thread ();
   return thr ? thr->target_tls : NULL;
 }
+
+/* Determine runtime default compute dimensions from environment. DIMS
+   must be zero-initialized.  Plugin will do remaining default & range
+   validation itself.  This should be called lazily on first required
+   use.
+
+   The tag will be used for device_type support.  */
+
+void
+GOMP_PLUGIN_acc_default_dims (const char *tag __attribute__((unused)),
+			      int *dims)
+{
+  const char *env_var = getenv ("GOMP_OPENACC_DIM");
+
+  if (env_var)
+    {
+      const char *pos = env_var;
+      int i;
+
+      GOMP_PLUGIN_debug (0, "Using GOMP_OPENACC_DIM=%s\n", env_var);
+      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+	{
+	  if (i && *pos++ != ':')
+	    break;
+	  if (*pos != ':')
+	    {
+	      const char *eptr;
+
+	      errno = 0;
+	      long val = strtol (pos, (char **)&eptr, 10);
+	      if (errno || val < 0 || (int) val != val)
+		break;
+	      dims[i] = (int) val;
+	      pos = eptr;
+	    }
+	}
+    }
+}
Index: libgomp/oacc-plugin.h
===================================================================
--- libgomp/oacc-plugin.h	(revision 233084)
+++ libgomp/oacc-plugin.h	(working copy)
@@ -30,4 +30,6 @@
 extern void GOMP_PLUGIN_async_unmap_vars (void *);
 extern void *GOMP_PLUGIN_acc_thread (void);
 
+extern void GOMP_PLUGIN_acc_default_dims (const char *tag, int *dims);
+
 #endif
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c	(revision 233084)
+++ libgomp/plugin/plugin-nvptx.c	(working copy)
@@ -894,9 +894,44 @@ nvptx_exec (void (*fn), size_t mapnum, v
   /* Initialize the launch dimensions.  Typically this is constant,
      provided by the device compiler, but we must permit runtime
      values.  */
-  for (i = 0; i != 3; i++)
-    if (targ_fn->launch->dim[i])
-      dims[i] = targ_fn->launch->dim[i];
+  int seen_zero = 0;
+  for (i = 0; i != GOMP_DIM_MAX; i++)
+    {
+      if (targ_fn->launch->dim[i])
+	dims[i] = targ_fn->launch->dim[i];
+      if (!dims[i])
+	seen_zero = 1;
+    }
+
+  if (seen_zero)
+    {
+      /* See if the user provided GOMP_OPENACC_DIM environment
+	 variable to specify runtime defaults. */
+      static int default_dims[GOMP_DIM_MAX];
+
+      if (!default_dims[0])
+	{
+	  GOMP_PLUGIN_acc_default_dims ("nvidia", default_dims);
+
+	  /* Do some sanity checking.  The CUDA API doesn't appear to
+	     provide queries to determine these limits.  */
+	  if (default_dims[GOMP_DIM_GANG] < 1)
+	    default_dims[GOMP_DIM_GANG] = 32;
+	  if (default_dims[GOMP_DIM_WORKER] < 1
+	      || default_dims[GOMP_DIM_WORKER] > 32)
+	    default_dims[GOMP_DIM_WORKER] = 32;
+	  default_dims[GOMP_DIM_VECTOR] = 32;
+
+	  GOMP_PLUGIN_debug (0, "Default dimensions [%d,%d,%d]\n",
+			     default_dims[GOMP_DIM_GANG],
+			     default_dims[GOMP_DIM_WORKER],
+			     default_dims[GOMP_DIM_VECTOR]);
+	}
+
+      for (i = 0; i != GOMP_DIM_MAX; i++)
+	if (!dims[i])
+	  dims[i] = default_dims[i];
+    }
 
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
@@ -918,7 +953,8 @@ nvptx_exec (void (*fn), size_t mapnum, v
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
 		     __FUNCTION__, targ_fn->launch->fn,
-		     dims[0], dims[1], dims[2]);
+		     dims[GOMP_DIM_GANG], dims[GOMP_DIM_WORKER],
+		     dims[GOMP_DIM_VECTOR]);
 
   // OpenACC		CUDA
   //
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c	(working copy)
@@ -0,0 +1,12 @@
+
+/* { dg-additional-options "-O2 -fopenacc-dim=-:-:32" } */
+
+#include "loop-dim-default.h"
+#include <stdlib.h>
+
+int main ()
+{
+  setenv ("GOMP_OPENACC_DIM", "8:16", 1);
+
+  return test_1 (8, 16, 32);
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c	(revision 233084)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c	(working copy)
@@ -1,133 +1,13 @@
 
-/* { dg-additional-options "-O2 -fopenacc-dim=16:16" } */
+/* { dg-additional-options "-O2 -fopenacc-dim=16:16:32" } */
 
-#include <openacc.h>
-#include <alloca.h>
-#include <string.h>
-#include <stdio.h>
-
-#pragma acc routine
-static int __attribute__ ((noinline)) coord ()
-{
-  int res = 0;
-
-  if (acc_on_device (acc_device_nvidia))
-    {
-      int g = 0, w = 0, v = 0;
-
-      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
-      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
-      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
-      res = (1 << 24) | (g << 16) | (w << 8) | v;
-    }
-  return res;
-}
-
-
-int check (const int *ary, int size, int gp, int wp, int vp)
-{
-  int exit = 0;
-  int ix;
-  int *gangs = (int *)alloca (gp * sizeof (int));
-  int *workers = (int *)alloca (wp * sizeof (int));
-  int *vectors = (int *)alloca (vp * sizeof (int));
-  int offloaded = 0;
-  
-  memset (gangs, 0, gp * sizeof (int));
-  memset (workers, 0, wp * sizeof (int));
-  memset (vectors, 0, vp * sizeof (int));
-  
-  for (ix = 0; ix < size; ix++)
-    {
-      int g = (ary[ix] >> 16) & 0xff;
-      int w = (ary[ix] >> 8) & 0xff;
-      int v = (ary[ix] >> 0) & 0xff;
-
-      if (g >= gp || w >= wp || v >= vp)
-	{
-	  printf ("unexpected cpu %#x used\n", ary[ix]);
-	  exit = 1;
-	}
-      else
-	{
-	  vectors[v]++;
-	  workers[w]++;
-	  gangs[g]++;
-	}
-      offloaded += ary[ix] >> 24;
-    }
-
-  if (!offloaded)
-    return 0;
-
-  if (offloaded != size)
-    {
-      printf ("offloaded %d times,  expected %d\n", offloaded, size);
-      return 1;
-    }
-
-  for (ix = 0; ix < gp; ix++)
-    if (gangs[ix] != gangs[0])
-      {
-	printf ("gang %d not used %d times\n", ix, gangs[0]);
-	exit = 1;
-      }
-  
-  for (ix = 0; ix < wp; ix++)
-    if (workers[ix] != workers[0])
-      {
-	printf ("worker %d not used %d times\n", ix, workers[0]);
-	exit = 1;
-      }
-  
-  for (ix = 0; ix < vp; ix++)
-    if (vectors[ix] != vectors[0])
-      {
-	printf ("vector %d not used %d times\n", ix, vectors[0]);
-	exit = 1;
-      }
-  
-  return exit;
-}
-
-#define N (32 *32*32)
-
-int test_1 (int gp, int wp, int vp)
-{
-  int ary[N];
-  int exit = 0;
-  
-#pragma acc parallel copyout (ary)
-  {
-#pragma acc loop gang (static:1)
-    for (int ix = 0; ix < N; ix++)
-      ary[ix] = coord ();
-  }
-
-  exit |= check (ary, N, gp, 1, 1);
-
-#pragma  acc parallel copyout (ary)
-  {
-#pragma acc loop worker
-    for (int ix = 0; ix < N; ix++)
-      ary[ix] = coord ();
-  }
-
-  exit |= check (ary, N, 1, wp, 1);
-
-#pragma  acc parallel copyout (ary)
-  {
-#pragma acc loop vector
-    for (int ix = 0; ix < N; ix++)
-      ary[ix] = coord ();
-  }
-
-  exit |= check (ary, N, 1, 1, vp);
-
-  return exit;
-}
+#include "loop-dim-default.h"
+#include <stdlib.h>
 
 int main ()
 {
+  /* Environment should be (silently) ignored.  */
+  setenv ("GOMP_OPENACC_DIM", "8:8:8",  1);
+
   return test_1 (16, 16, 32);
 }
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h	(working copy)
@@ -0,0 +1,125 @@
+#include <openacc.h>
+#include <alloca.h>
+#include <string.h>
+#include <stdio.h>
+
+#pragma acc routine
+static int __attribute__ ((noinline)) coord ()
+{
+  int res = 0;
+
+  if (acc_on_device (acc_device_nvidia))
+    {
+      int g = 0, w = 0, v = 0;
+
+      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+      res = (1 << 24) | (g << 16) | (w << 8) | v;
+    }
+  return res;
+}
+
+
+int check (const int *ary, int size, int gp, int wp, int vp)
+{
+  int exit = 0;
+  int ix;
+  int *gangs = (int *)alloca (gp * sizeof (int));
+  int *workers = (int *)alloca (wp * sizeof (int));
+  int *vectors = (int *)alloca (vp * sizeof (int));
+  int offloaded = 0;
+  
+  memset (gangs, 0, gp * sizeof (int));
+  memset (workers, 0, wp * sizeof (int));
+  memset (vectors, 0, vp * sizeof (int));
+  
+  for (ix = 0; ix < size; ix++)
+    {
+      int g = (ary[ix] >> 16) & 0xff;
+      int w = (ary[ix] >> 8) & 0xff;
+      int v = (ary[ix] >> 0) & 0xff;
+
+      if (g >= gp || w >= wp || v >= vp)
+	{
+	  printf ("unexpected cpu %#x used\n", ary[ix]);
+	  exit = 1;
+	}
+      else
+	{
+	  vectors[v]++;
+	  workers[w]++;
+	  gangs[g]++;
+	}
+      offloaded += ary[ix] >> 24;
+    }
+
+  if (!offloaded)
+    return 0;
+
+  if (offloaded != size)
+    {
+      printf ("offloaded %d times,  expected %d\n", offloaded, size);
+      return 1;
+    }
+
+  for (ix = 0; ix < gp; ix++)
+    if (gangs[ix] != gangs[0])
+      {
+	printf ("gang %d not used %d times\n", ix, gangs[0]);
+	exit = 1;
+      }
+  
+  for (ix = 0; ix < wp; ix++)
+    if (workers[ix] != workers[0])
+      {
+	printf ("worker %d not used %d times\n", ix, workers[0]);
+	exit = 1;
+      }
+  
+  for (ix = 0; ix < vp; ix++)
+    if (vectors[ix] != vectors[0])
+      {
+	printf ("vector %d not used %d times\n", ix, vectors[0]);
+	exit = 1;
+      }
+  
+  return exit;
+}
+
+#define N (32 *32*32)
+
+int test_1 (int gp, int wp, int vp)
+{
+  int ary[N];
+  int exit = 0;
+  
+#pragma acc parallel copyout (ary)
+  {
+#pragma acc loop gang (static:1)
+    for (int ix = 0; ix < N; ix++)
+      ary[ix] = coord ();
+  }
+
+  exit |= check (ary, N, gp, 1, 1);
+
+#pragma  acc parallel copyout (ary)
+  {
+#pragma acc loop worker
+    for (int ix = 0; ix < N; ix++)
+      ary[ix] = coord ();
+  }
+
+  exit |= check (ary, N, 1, wp, 1);
+
+#pragma  acc parallel copyout (ary)
+  {
+#pragma acc loop vector
+    for (int ix = 0; ix < N; ix++)
+      ary[ix] = coord ();
+  }
+
+  exit |= check (ary, N, 1, 1, vp);
+
+  return exit;
+}

Follow-Ups:
- Re: Default compute dimensions (runtime)
  - From: Alexander Monakov

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]