This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[OpenACC] Enable firstprivate OpenACC reductions
- From: Cesar Philippidis <cesar at codesourcery dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>, Jakub Jelinek <jakub at redhat dot com>
- Date: Wed, 5 Sep 2018 13:08:21 -0700
- Subject: [OpenACC] Enable firstprivate OpenACC reductions
This patch teaches the gimplifier how to pass certain OpenACC reduction
variables as firstprivate, and not with an implicit copy directive. This
is matches the default behavior for the implicit data mappings of scalar
variables inside OpenACC parallel regions. It should be noted that the
gimplifier will still implicitly map reduction variables on loops
immediately enclosed inside a parallel regions, like so
#pragma acc parallel
#pragma acc loop reduction(+:sum)
as copy. This change only impacts reductions variables inside nested acc
loops like
#pragma acc parallel
#pragma acc loop
for (...)
{
#pragma acc loop reduction(+:s2)
Here s2 will be transferred into the accelerator as firstprivate instead
of copy.
Is this OK for trunk? I regtested and bootstrapped for x86_64 with nvptx
offloading.
Cesar
[OpenACC] Enable firstprivate OpenACC reductions
2018-XX-YY Cesar Philippidis <cesar@codesourcery.com>
Chung-Lin Tang <cltang@codesourcery.com>
gcc/
* gimplify.c (omp_add_variable): Enable firstprivate reduction
variables.
gcc/testsuite/
* c-c++-common/goacc/reduction-8.c: New test.
libgomp/
* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c: New
test.
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index dbd0f0ebd0c..4d954e20788 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -6823,20 +6823,27 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
else
splay_tree_insert (ctx->variables, (splay_tree_key)decl, flags);
- /* For reductions clauses in OpenACC loop directives, by default create a
- copy clause on the enclosing parallel construct for carrying back the
- results. */
+ /* For OpenACC loop directives, when a reduction clause is placed on
+ the outermost acc loop within an acc parallel or kernels
+ construct, it must have an implied copy data mapping. E.g.
+
+ #pragma acc parallel
+ {
+ #pragma acc loop reduction (+:sum)
+
+ a copy clause for sum should be added on the enclosing parallel
+ construct for carrying back the results. */
if (ctx->region_type == ORT_ACC && (flags & GOVD_REDUCTION))
{
struct gimplify_omp_ctx *outer_ctx = ctx->outer_context;
- while (outer_ctx)
+ if (outer_ctx)
{
n = splay_tree_lookup (outer_ctx->variables, (splay_tree_key)decl);
if (n != NULL)
{
/* Ignore local variables and explicitly declared clauses. */
if (n->value & (GOVD_LOCAL | GOVD_EXPLICIT))
- break;
+ ;
else if (outer_ctx->region_type == ORT_ACC_KERNELS)
{
/* According to the OpenACC spec, such a reduction variable
@@ -6856,9 +6863,7 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree decl, unsigned int flags)
{
splay_tree_insert (outer_ctx->variables, (splay_tree_key)decl,
GOVD_MAP | GOVD_SEEN);
- break;
}
- outer_ctx = outer_ctx->outer_context;
}
}
}
diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-8.c b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
new file mode 100644
index 00000000000..8a0283f4ac3
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/reduction-8.c
@@ -0,0 +1,94 @@
+/* { dg-additional-options "-fdump-tree-gimple" } */
+
+#define n 1000
+
+int
+main(void)
+{
+ int i, j;
+ int result, array[n];
+
+#pragma acc parallel loop reduction (+:result)
+ for (i = 0; i < n; i++)
+ result ++;
+
+#pragma acc parallel
+#pragma acc loop reduction (+:result)
+ for (i = 0; i < n; i++)
+ result ++;
+
+#pragma acc parallel
+#pragma acc loop
+ for (i = 0; i < n; i++)
+ {
+ result = i;
+
+#pragma acc loop reduction(+:result)
+ for (j = 0; j < n; j++)
+ result ++;
+
+ array[i] = result;
+ }
+
+#pragma acc parallel
+#pragma acc loop
+ for (i = 0; i < n; i++)
+ {
+ result = i;
+
+#pragma acc loop worker vector reduction(+:result)
+ for (j = 0; j < n; j++)
+ result ++;
+
+ array[i] = result;
+ }
+
+#pragma acc parallel
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+ for (i = 0; i < n; i++)
+ {
+ result = i;
+
+#pragma acc loop gang reduction(+:result)
+ for (j = 0; j < n; j++)
+ result ++;
+
+ array[i] = result;
+ }
+
+#pragma acc parallel copy(result)
+#pragma acc loop // { dg-warning "insufficient partitioning" }
+ for (i = 0; i < n; i++)
+ {
+ result = i;
+
+#pragma acc loop gang reduction(+:result)
+ for (j = 0; j < n; j++)
+ result ++;
+
+ array[i] = result;
+ }
+
+#pragma acc kernels
+#pragma acc loop
+ for (i = 0; i < n; i++)
+ {
+ result = i;
+
+#pragma acc loop reduction(+:result)
+ for (j = 0; j < n; j++)
+ result ++;
+
+ array[i] = result;
+ }
+
+ return 0;
+}
+
+/* Check that default copy maps are generated for loop reductions. */
+/* { dg-final { scan-tree-dump-times "reduction..:result. map.tofrom:result .len: 4.." 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel map.tofrom:result .len: 4.." 2 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:array .len: 4000.. firstprivate.result." 3 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:result .len: 4.. map.tofrom:array .len: 4000.." 1 "gimple" } } */
+/* { dg-final { scan-tree-dump-times "map.tofrom:array .len: 4000.. map.force_tofrom:result .len: 4.." 1 "gimple" } } */
+
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c
new file mode 100644
index 00000000000..206e66fec79
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-1.c
@@ -0,0 +1,41 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+#define N 100
+ int n = N;
+ int i, j, tmp;
+ int input[N*N], output[N], houtput[N];
+
+ for (i = 0; i < n * n; i++)
+ input[i] = i;
+
+ for (i = 0; i < n; i++)
+ {
+ tmp = 0;
+ for (j = 0; j < n; j++)
+ tmp += input[i * n + j];
+ houtput[i] = tmp;
+ }
+
+ #pragma acc parallel loop gang
+ for (i = 0; i < n; i++)
+ {
+ tmp = 0;
+
+ #pragma acc loop worker reduction(+:tmp)
+ for (j = 0; j < n; j++)
+ tmp += input[i * n + j];
+
+ output[i] = tmp;
+ }
+
+ /* Test if every worker-level reduction had correct private result. */
+ for (i = 0; i < n; i++)
+ if (houtput[i] != output[i])
+ abort ();
+
+ return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c
new file mode 100644
index 00000000000..0c317dcf8a6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/privatize-reduction-2.c
@@ -0,0 +1,23 @@
+#include <assert.h>
+
+int
+main ()
+{
+ const int n = 1000;
+ int i, j, temp, a[n];
+
+#pragma acc parallel loop
+ for (i = 0; i < n; i++)
+ {
+ temp = i;
+#pragma acc loop reduction (+:temp)
+ for (j = 0; j < n; j++)
+ temp ++;
+ a[i] = temp;
+ }
+
+ for (i = 0; i < n; i++)
+ assert (a[i] == i+n);
+
+ return 0;
+}
--
2.17.1