This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization
- From: Tom de Vries <Tom_deVries at mentor dot com>
- To: Cesar Philippidis <cesar at codesourcery dot com>
- Cc: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Thu, 5 Apr 2018 18:33:30 +0200
- Subject: Re: [og7] vector_length extension part 2: Generalize state propagation and synchronization
- References: <d6642c62-6d01-10ce-dda2-f5fa453ed971@codesourcery.com> <823cc381-8752-14df-d6e2-0203de5da2fb@codesourcery.com> <5f60a648-184d-6ada-7412-b931b087826e@mentor.com> <6c698973-520b-26bc-ee83-c17077442b85@mentor.com> <f7cf9222-1b3c-be8e-b95c-324d0ec0211b@codesourcery.com> <7a99e1df-98cb-1df3-03bc-ac2888b3637f@mentor.com>
On 03/30/2018 05:14 PM, Tom de Vries wrote:
On 03/30/2018 05:00 PM, Cesar Philippidis wrote:
I should
have checked that patch with the vector length fallback disabled.
Right. The patch series introduces a lot of code that is not exercised.
I've added an -mlong-vector-in-workers option in my local branch and
added 3 test-cases to exercise the code with fallback disabled everytime
I run the libgomp tests.
This patch adds that option.
Build x86_64 with nvptx accelerator and tested libgomp.
Committed.
Thanks,
- Tom
[nvptx] Add -mlong-vector-in-workers
2018-04-05 Tom de Vries <tom@codesourcery.com>
* config/nvptx/nvptx.c (nvptx_adjust_parallelism): Handle
nvptx_long_vectors_in_workers.
* config/nvptx/nvptx.opt (mlong-vector-in-workers): Add option.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-4.c: New test.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-5.c: New test.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-6.c: New test.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-8.c: New test.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-9.c: New test.
---
gcc/config/nvptx/nvptx.c | 3 +-
gcc/config/nvptx/nvptx.opt | 3 ++
.../vector-length-128-4.c | 41 ++++++++++++++++++++
.../vector-length-128-5.c | 42 +++++++++++++++++++++
.../vector-length-128-6.c | 42 +++++++++++++++++++++
.../vector-length-128-8.c | 44 ++++++++++++++++++++++
.../vector-length-128-9.c | 44 ++++++++++++++++++++++
7 files changed, 218 insertions(+), 1 deletion(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 595413a..b5e6dce 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -5397,7 +5397,8 @@ nvptx_adjust_parallelism (unsigned inner_mask, unsigned outer_mask)
worker loop. Therefore, fallback to setting vector_length to
PTX_WARP_SIZE. Hopefully this condition may be relaxed for
sm_70+ targets. */
- if ((inner_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ if (nvptx_long_vectors_in_workers == 0
+ && (inner_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
&& (outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
{
tree attr = tree_cons (get_identifier (NVPTX_GOACC_VL_WARP), NULL_TREE,
diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt
index e2d64bd..f7f37ec 100644
--- a/gcc/config/nvptx/nvptx.opt
+++ b/gcc/config/nvptx/nvptx.opt
@@ -62,3 +62,6 @@ Enum(ptx_isa) String(sm_35) Value(PTX_ISA_SM35)
misa=
Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_SM30)
Specify the version of the ptx ISA to use.
+
+mlong-vector-in-workers
+Target Var(nvptx_long_vectors_in_workers) Undocumented Init(0)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-4.c
new file mode 100644
index 0000000..6d43f82
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-4.c
@@ -0,0 +1,41 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */
+/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+ for (unsigned int i = 0; i < n; ++i)
+ {
+ a[i] = i % 3;
+ b[i] = i % 5;
+ }
+
+#pragma acc parallel num_workers (2) vector_length (128) copyin (a,b) copyout (c)
+ {
+#pragma acc loop worker
+ for (unsigned int i = 0; i < 4; i++)
+#pragma acc loop vector
+ for (unsigned int j = 0; j < n / 4; j++)
+ c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j];
+ }
+
+ for (unsigned int i = 0; i < n; ++i)
+ if (c[i] != (i % 3) + (i % 5))
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 2, 128\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=2, vectors=128" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-5.c
new file mode 100644
index 0000000..661fdc7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-5.c
@@ -0,0 +1,42 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-fopenacc-dim=-:2:128" } */
+/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */
+/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+ for (unsigned int i = 0; i < n; ++i)
+ {
+ a[i] = i % 3;
+ b[i] = i % 5;
+ }
+
+#pragma acc parallel copyin (a,b) copyout (c)
+ {
+#pragma acc loop worker
+ for (unsigned int i = 0; i < 4; i++)
+#pragma acc loop vector
+ for (unsigned int j = 0; j < n / 4; j++)
+ c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j];
+ }
+
+ for (unsigned int i = 0; i < n; ++i)
+ if (c[i] != (i % 3) + (i % 5))
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 2, 128\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=2, vectors=128" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-6.c
new file mode 100644
index 0000000..91f611e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-6.c
@@ -0,0 +1,42 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-set-target-env-var "GOMP_OPENACC_DIM" ":2:" } */
+/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */
+/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+ for (unsigned int i = 0; i < n; ++i)
+ {
+ a[i] = i % 3;
+ b[i] = i % 5;
+ }
+
+#pragma acc parallel vector_length (128) copyin (a,b) copyout (c)
+ {
+#pragma acc loop worker
+ for (unsigned int i = 0; i < 4; i++)
+#pragma acc loop vector
+ for (unsigned int j = 0; j < n / 4; j++)
+ c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j];
+ }
+
+ for (unsigned int i = 0; i < n; ++i)
+ if (c[i] != (i % 3) + (i % 5))
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 0, 128\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=2, vectors=128" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-8.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-8.c
new file mode 100644
index 0000000..6246067
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-8.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */
+/* { dg-additional-options "-fopenacc-dim=-:-:-" } */
+/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+ for (unsigned int i = 0; i < n; ++i)
+ {
+ a[i] = i % 3;
+ b[i] = i % 5;
+ }
+
+#pragma acc parallel copyin (a,b) copyout (c)
+ {
+#pragma acc loop worker
+ for (unsigned int i = 0; i < 4; i++)
+#pragma acc loop vector
+ for (unsigned int j = 0; j < n / 4; j++)
+ c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j];
+ }
+
+ for (unsigned int i = 0; i < n; ++i)
+ if (c[i] != (i % 3) + (i % 5))
+ abort ();
+
+ return 0;
+}
+
+/* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
+
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 0, 32\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=32, vectors=32" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-9.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-9.c
new file mode 100644
index 0000000..2f8b4b7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-9.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */
+/* { dg-additional-options "-fopenacc-dim=-:8:-" } */
+/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+ for (unsigned int i = 0; i < n; ++i)
+ {
+ a[i] = i % 3;
+ b[i] = i % 5;
+ }
+
+#pragma acc parallel copyin (a,b) copyout (c)
+ {
+#pragma acc loop worker
+ for (unsigned int i = 0; i < 4; i++)
+#pragma acc loop vector
+ for (unsigned int j = 0; j < n / 4; j++)
+ c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j];
+ }
+
+ for (unsigned int i = 0; i < n; ++i)
+ if (c[i] != (i % 3) + (i % 5))
+ abort ();
+
+ return 0;
+}
+
+/* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
+
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 8, 32\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=8, vectors=32" } */