This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Default compute dimensions (runtime)
- From: Nathan Sidwell <nathan at acm dot org>
- To: Jakub Jelinek <jakub at redhat dot com>
- Cc: GCC Patches <gcc-patches at gcc dot gnu dot org>
- Date: Wed, 3 Feb 2016 10:30:43 -0500
- Subject: Default compute dimensions (runtime)
- Authentication-results: sourceware.org; auth=none
Jakub,
this is the runtime side of default compute dimension support.
1) extend the -fopenacc-dim=X:Y:Z syntax to allow '-' indicating a runtime
choice. (0 also indicates that, but I thought best to have an explicit syntax
as well).
2) New plugin helper 'GOMP_PLUGIN_acc_default_dims' that parses a
GOMP_OPENACC_DIM environment variable. The syntax here is the same as that for
the -fopenacc-dim option -- except '-' isn't permitted. I have future-proofed
the interface by including a plugin tag parameter. This will permit
device_type support.
3) the plugin itself lazily calls GOMP_PLUGIN_acc_default_dims when it sees an
unspecified dimension. Validates the default dimensions and then plugs them
into the launch parameters.
The testcase reuses the compile-time testcase by breaking its core to a header
file and explicitly setting the environment variable before first launch. The
original testcase also explitily sets the environment variable, to make sure
it's not being considered.
There doesn't seem to be a mechanism warning messages -- only debug ones or
fatal errors. I'm not sure what the best approach to handling errors in the env
var parsing, and ducked to silently ignore problems (and the plugin will then
provide fallback values).
ok?
nathan
2016-02-03 Nathan Sidwell <nathan@codesourcery.com>
gcc/
* doc/invoke.texi (fopenacc-dim): Document runtime support.
* omp-low.c(oacc_parse_default_dims): Add runtime support.
libgomp/
* libgomp.map (GOMP_PLUGIN_acc_default_dims): New.
* oacc-parallel.c (GOACC_parallel_keyed): Zero initialize dims.
* oacc-plugin.c (GOMP_PLUGIN_acc_default_dims): New.
* oacc-plugin.h (GOMP_PLUGIN_acc_default_dims): Declare.
* plugin/plugin-nvptx.c (nvptx_exec): Add support for runtime
defaul dimensions.
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Breakout
body to and #include ...
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h: ... this.
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c: New.
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi (revision 233084)
+++ gcc/doc/invoke.texi (working copy)
@@ -1969,7 +1969,12 @@ have support for @option{-pthread}.
Specify default compute dimensions for parallel offload regions that do
not explicitly specify. The @var{geom} value is a triple of
':'-separated sizes, in order 'gang', 'worker' and, 'vector'. A size
-can be omitted, to use a target-specific default value.
+can be omitted, to use a target-specific default value. Use '-' to defer
+the size determination until execution. In that case, the environment
+variable @var{GOMP_OPENACC_DIM} should be set. It has the same format
+as the option value, except that '-' is not permitted. If it is unset,
+a target-specific value is chosen. Runtime and compile-time values can
+be freely mixed.
@item -fopenmp
@opindex fopenmp
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c (revision 233084)
+++ gcc/omp-low.c (working copy)
@@ -20275,9 +20275,14 @@ oacc_parse_default_dims (const char *dim
pos++;
}
- if (*pos != ':')
+ long val = -1;
+ if (*pos == '-')
+ {
+ pos++;
+ val = 0;
+ }
+ else if (*pos != ':')
{
- long val;
const char *eptr;
errno = 0;
@@ -20285,8 +20290,8 @@ oacc_parse_default_dims (const char *dim
if (errno || val <= 0 || (int) val != val)
goto malformed;
pos = eptr;
- oacc_default_dims[ix] = (int) val;
}
+ oacc_default_dims[ix] = (int) val;
}
if (*pos)
{
Index: libgomp/libgomp.map
===================================================================
--- libgomp/libgomp.map (revision 233084)
+++ libgomp/libgomp.map (working copy)
@@ -411,4 +411,5 @@ GOMP_PLUGIN_1.0 {
GOMP_PLUGIN_1.1 {
global:
GOMP_PLUGIN_target_task_completion;
+ GOMP_PLUGIN_acc_default_dims;
} GOMP_PLUGIN_1.0;
Index: libgomp/oacc-parallel.c
===================================================================
--- libgomp/oacc-parallel.c (revision 233084)
+++ libgomp/oacc-parallel.c (working copy)
@@ -103,6 +103,7 @@ GOACC_parallel_keyed (int device, void (
return;
}
+ memset (dims, 0, sizeof (dims));
va_start (ap, kinds);
/* TODO: This will need amending when device_type is implemented. */
while ((tag = va_arg (ap, unsigned)) != 0)
Index: libgomp/oacc-plugin.c
===================================================================
--- libgomp/oacc-plugin.c (revision 233084)
+++ libgomp/oacc-plugin.c (working copy)
@@ -29,6 +29,9 @@
#include "libgomp.h"
#include "oacc-plugin.h"
#include "oacc-int.h"
+#include "gomp-constants.h"
+#include <stdlib.h>
+#include <errno.h>
void
GOMP_PLUGIN_async_unmap_vars (void *ptr)
@@ -46,3 +49,41 @@ GOMP_PLUGIN_acc_thread (void)
struct goacc_thread *thr = goacc_thread ();
return thr ? thr->target_tls : NULL;
}
+
+/* Determine runtime default compute dimensions from environment. DIMS
+ must be zero-initialized. Plugin will do remaining default & range
+ validation itself. This should be called lazily on first required
+ use.
+
+ The tag will be used for device_type support. */
+
+void
+GOMP_PLUGIN_acc_default_dims (const char *tag __attribute__((unused)),
+ int *dims)
+{
+ const char *env_var = getenv ("GOMP_OPENACC_DIM");
+
+ if (env_var)
+ {
+ const char *pos = env_var;
+ int i;
+
+ GOMP_PLUGIN_debug (0, "Using GOMP_OPENACC_DIM=%s\n", env_var);
+ for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+ {
+ if (i && *pos++ != ':')
+ break;
+ if (*pos != ':')
+ {
+ const char *eptr;
+
+ errno = 0;
+ long val = strtol (pos, (char **)&eptr, 10);
+ if (errno || val < 0 || (int) val != val)
+ break;
+ dims[i] = (int) val;
+ pos = eptr;
+ }
+ }
+ }
+}
Index: libgomp/oacc-plugin.h
===================================================================
--- libgomp/oacc-plugin.h (revision 233084)
+++ libgomp/oacc-plugin.h (working copy)
@@ -30,4 +30,6 @@
extern void GOMP_PLUGIN_async_unmap_vars (void *);
extern void *GOMP_PLUGIN_acc_thread (void);
+extern void GOMP_PLUGIN_acc_default_dims (const char *tag, int *dims);
+
#endif
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c (revision 233084)
+++ libgomp/plugin/plugin-nvptx.c (working copy)
@@ -894,9 +894,44 @@ nvptx_exec (void (*fn), size_t mapnum, v
/* Initialize the launch dimensions. Typically this is constant,
provided by the device compiler, but we must permit runtime
values. */
- for (i = 0; i != 3; i++)
- if (targ_fn->launch->dim[i])
- dims[i] = targ_fn->launch->dim[i];
+ int seen_zero = 0;
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ {
+ if (targ_fn->launch->dim[i])
+ dims[i] = targ_fn->launch->dim[i];
+ if (!dims[i])
+ seen_zero = 1;
+ }
+
+ if (seen_zero)
+ {
+ /* See if the user provided GOMP_OPENACC_DIM environment
+ variable to specify runtime defaults. */
+ static int default_dims[GOMP_DIM_MAX];
+
+ if (!default_dims[0])
+ {
+ GOMP_PLUGIN_acc_default_dims ("nvidia", default_dims);
+
+ /* Do some sanity checking. The CUDA API doesn't appear to
+ provide queries to determine these limits. */
+ if (default_dims[GOMP_DIM_GANG] < 1)
+ default_dims[GOMP_DIM_GANG] = 32;
+ if (default_dims[GOMP_DIM_WORKER] < 1
+ || default_dims[GOMP_DIM_WORKER] > 32)
+ default_dims[GOMP_DIM_WORKER] = 32;
+ default_dims[GOMP_DIM_VECTOR] = 32;
+
+ GOMP_PLUGIN_debug (0, "Default dimensions [%d,%d,%d]\n",
+ default_dims[GOMP_DIM_GANG],
+ default_dims[GOMP_DIM_WORKER],
+ default_dims[GOMP_DIM_VECTOR]);
+ }
+
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ if (!dims[i])
+ dims[i] = default_dims[i];
+ }
/* This reserves a chunk of a pre-allocated page of memory mapped on both
the host and the device. HP is a host pointer to the new chunk, and DP is
@@ -918,7 +953,8 @@ nvptx_exec (void (*fn), size_t mapnum, v
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" gangs=%u, workers=%u, vectors=%u\n",
__FUNCTION__, targ_fn->launch->fn,
- dims[0], dims[1], dims[2]);
+ dims[GOMP_DIM_GANG], dims[GOMP_DIM_WORKER],
+ dims[GOMP_DIM_VECTOR]);
// OpenACC CUDA
//
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c (revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default-2.c (working copy)
@@ -0,0 +1,12 @@
+
+/* { dg-additional-options "-O2 -fopenacc-dim=-:-:32" } */
+
+#include "loop-dim-default.h"
+#include <stdlib.h>
+
+int main ()
+{
+ setenv ("GOMP_OPENACC_DIM", "8:16", 1);
+
+ return test_1 (8, 16, 32);
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (revision 233084)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (working copy)
@@ -1,133 +1,13 @@
-/* { dg-additional-options "-O2 -fopenacc-dim=16:16" } */
+/* { dg-additional-options "-O2 -fopenacc-dim=16:16:32" } */
-#include <openacc.h>
-#include <alloca.h>
-#include <string.h>
-#include <stdio.h>
-
-#pragma acc routine
-static int __attribute__ ((noinline)) coord ()
-{
- int res = 0;
-
- if (acc_on_device (acc_device_nvidia))
- {
- int g = 0, w = 0, v = 0;
-
- __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
- __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
- __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
- res = (1 << 24) | (g << 16) | (w << 8) | v;
- }
- return res;
-}
-
-
-int check (const int *ary, int size, int gp, int wp, int vp)
-{
- int exit = 0;
- int ix;
- int *gangs = (int *)alloca (gp * sizeof (int));
- int *workers = (int *)alloca (wp * sizeof (int));
- int *vectors = (int *)alloca (vp * sizeof (int));
- int offloaded = 0;
-
- memset (gangs, 0, gp * sizeof (int));
- memset (workers, 0, wp * sizeof (int));
- memset (vectors, 0, vp * sizeof (int));
-
- for (ix = 0; ix < size; ix++)
- {
- int g = (ary[ix] >> 16) & 0xff;
- int w = (ary[ix] >> 8) & 0xff;
- int v = (ary[ix] >> 0) & 0xff;
-
- if (g >= gp || w >= wp || v >= vp)
- {
- printf ("unexpected cpu %#x used\n", ary[ix]);
- exit = 1;
- }
- else
- {
- vectors[v]++;
- workers[w]++;
- gangs[g]++;
- }
- offloaded += ary[ix] >> 24;
- }
-
- if (!offloaded)
- return 0;
-
- if (offloaded != size)
- {
- printf ("offloaded %d times, expected %d\n", offloaded, size);
- return 1;
- }
-
- for (ix = 0; ix < gp; ix++)
- if (gangs[ix] != gangs[0])
- {
- printf ("gang %d not used %d times\n", ix, gangs[0]);
- exit = 1;
- }
-
- for (ix = 0; ix < wp; ix++)
- if (workers[ix] != workers[0])
- {
- printf ("worker %d not used %d times\n", ix, workers[0]);
- exit = 1;
- }
-
- for (ix = 0; ix < vp; ix++)
- if (vectors[ix] != vectors[0])
- {
- printf ("vector %d not used %d times\n", ix, vectors[0]);
- exit = 1;
- }
-
- return exit;
-}
-
-#define N (32 *32*32)
-
-int test_1 (int gp, int wp, int vp)
-{
- int ary[N];
- int exit = 0;
-
-#pragma acc parallel copyout (ary)
- {
-#pragma acc loop gang (static:1)
- for (int ix = 0; ix < N; ix++)
- ary[ix] = coord ();
- }
-
- exit |= check (ary, N, gp, 1, 1);
-
-#pragma acc parallel copyout (ary)
- {
-#pragma acc loop worker
- for (int ix = 0; ix < N; ix++)
- ary[ix] = coord ();
- }
-
- exit |= check (ary, N, 1, wp, 1);
-
-#pragma acc parallel copyout (ary)
- {
-#pragma acc loop vector
- for (int ix = 0; ix < N; ix++)
- ary[ix] = coord ();
- }
-
- exit |= check (ary, N, 1, 1, vp);
-
- return exit;
-}
+#include "loop-dim-default.h"
+#include <stdlib.h>
int main ()
{
+ /* Environment should be (silently) ignored. */
+ setenv ("GOMP_OPENACC_DIM", "8:8:8", 1);
+
return test_1 (16, 16, 32);
}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h (revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.h (working copy)
@@ -0,0 +1,125 @@
+#include <openacc.h>
+#include <alloca.h>
+#include <string.h>
+#include <stdio.h>
+
+#pragma acc routine
+static int __attribute__ ((noinline)) coord ()
+{
+ int res = 0;
+
+ if (acc_on_device (acc_device_nvidia))
+ {
+ int g = 0, w = 0, v = 0;
+
+ __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+ __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+ __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+ res = (1 << 24) | (g << 16) | (w << 8) | v;
+ }
+ return res;
+}
+
+
+int check (const int *ary, int size, int gp, int wp, int vp)
+{
+ int exit = 0;
+ int ix;
+ int *gangs = (int *)alloca (gp * sizeof (int));
+ int *workers = (int *)alloca (wp * sizeof (int));
+ int *vectors = (int *)alloca (vp * sizeof (int));
+ int offloaded = 0;
+
+ memset (gangs, 0, gp * sizeof (int));
+ memset (workers, 0, wp * sizeof (int));
+ memset (vectors, 0, vp * sizeof (int));
+
+ for (ix = 0; ix < size; ix++)
+ {
+ int g = (ary[ix] >> 16) & 0xff;
+ int w = (ary[ix] >> 8) & 0xff;
+ int v = (ary[ix] >> 0) & 0xff;
+
+ if (g >= gp || w >= wp || v >= vp)
+ {
+ printf ("unexpected cpu %#x used\n", ary[ix]);
+ exit = 1;
+ }
+ else
+ {
+ vectors[v]++;
+ workers[w]++;
+ gangs[g]++;
+ }
+ offloaded += ary[ix] >> 24;
+ }
+
+ if (!offloaded)
+ return 0;
+
+ if (offloaded != size)
+ {
+ printf ("offloaded %d times, expected %d\n", offloaded, size);
+ return 1;
+ }
+
+ for (ix = 0; ix < gp; ix++)
+ if (gangs[ix] != gangs[0])
+ {
+ printf ("gang %d not used %d times\n", ix, gangs[0]);
+ exit = 1;
+ }
+
+ for (ix = 0; ix < wp; ix++)
+ if (workers[ix] != workers[0])
+ {
+ printf ("worker %d not used %d times\n", ix, workers[0]);
+ exit = 1;
+ }
+
+ for (ix = 0; ix < vp; ix++)
+ if (vectors[ix] != vectors[0])
+ {
+ printf ("vector %d not used %d times\n", ix, vectors[0]);
+ exit = 1;
+ }
+
+ return exit;
+}
+
+#define N (32 *32*32)
+
+int test_1 (int gp, int wp, int vp)
+{
+ int ary[N];
+ int exit = 0;
+
+#pragma acc parallel copyout (ary)
+ {
+#pragma acc loop gang (static:1)
+ for (int ix = 0; ix < N; ix++)
+ ary[ix] = coord ();
+ }
+
+ exit |= check (ary, N, gp, 1, 1);
+
+#pragma acc parallel copyout (ary)
+ {
+#pragma acc loop worker
+ for (int ix = 0; ix < N; ix++)
+ ary[ix] = coord ();
+ }
+
+ exit |= check (ary, N, 1, wp, 1);
+
+#pragma acc parallel copyout (ary)
+ {
+#pragma acc loop vector
+ for (int ix = 0; ix < N; ix++)
+ ary[ix] = coord ();
+ }
+
+ exit |= check (ary, N, 1, 1, vp);
+
+ return exit;
+}