This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
[patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels
- From: Cesar Philippidis <cesar at codesourcery dot com>
- To: "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>, Thomas Schwinge <thomas at codesourcery dot com>
- Cc: Chung-Lin Tang <cltang at codesourcery dot com>
- Date: Thu, 20 Sep 2018 10:14:45 -0700
- Subject: [patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels
As Chung-Lin noted here
<https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01079.html>:
This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a
"sequential" loop form (without the OMP runtime calls), used for loop
directives inside OpenACC kernels constructs. Tom mentions that this
allows the kernels parallelization to work when '#pragma acc loop'
makes the front-ends create OMP_FOR, which the loop analysis phases
don't understand.
I bootstrapped and regtested it on x86_64 Linux with nvptx offloading.
Is this patch OK for trunk?
Thanks,
Cesar
[OpenACC] Generate sequential loop for OpenACC loop directive inside kernels
2018-XX-YY Chung-Lin Tang <cltang@codesourcery.com>
Cesar Philippidis <cesar@codesourcery.com>
gcc/
* omp-expand.c (struct omp_region): Add inside_kernels_p field.
(expand_omp_for_generic): Adjust to generate a 'sequential' loop
when GOMP builtin arguments are BUILT_IN_NONE.
(expand_omp_for): Use expand_omp_for_generic to generate a
non-parallelized loop for OMP_FORs inside OpenACC kernels regions.
(expand_omp): Mark inside_kernels_p field true for regions
nested inside OpenACC kernels constructs.
gcc/testsuite/
* c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
* c-c++-common/goacc/kernels-acc-loop-reduction.c: New test.
* c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test.
(cherry picked from gomp-4_0-branch r224505, r224837, r228232, r228233,
r231461, and r247958)
---
gcc/omp-expand.c | 136 ++++++++++++------
.../goacc/kernels-acc-loop-reduction.c | 23 +++
.../goacc/kernels-acc-loop-smaller-equal.c | 23 +++
.../goacc/kernels-loop-2-acc-loop.c | 18 +++
.../goacc/kernels-loop-3-acc-loop.c | 15 ++
.../goacc/kernels-loop-acc-loop.c | 15 ++
.../goacc/kernels-loop-n-acc-loop.c | 15 ++
7 files changed, 204 insertions(+), 41 deletions(-)
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index d2a77c067c6..9b03f62e065 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -104,6 +104,9 @@ struct omp_region
/* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has
a depend clause. */
gomp_ordered *ord_stmt;
+
+ /* True if this is nested inside an OpenACC kernels construct. */
+ bool inside_kernels_p;
};
static struct omp_region *root_omp_region;
@@ -2509,6 +2512,7 @@ expand_omp_for_generic (struct omp_region *region,
gassign *assign_stmt;
bool in_combined_parallel = is_combined_parallel (region);
bool broken_loop = region->cont == NULL;
+ bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE);
edge e, ne;
tree *counts = NULL;
int i;
@@ -2606,8 +2610,12 @@ expand_omp_for_generic (struct omp_region *region,
type = TREE_TYPE (fd->loop.v);
istart0 = create_tmp_var (fd->iter_type, ".istart0");
iend0 = create_tmp_var (fd->iter_type, ".iend0");
- TREE_ADDRESSABLE (istart0) = 1;
- TREE_ADDRESSABLE (iend0) = 1;
+
+ if (!seq_loop)
+ {
+ TREE_ADDRESSABLE (istart0) = 1;
+ TREE_ADDRESSABLE (iend0) = 1;
+ }
/* See if we need to bias by LLONG_MIN. */
if (fd->iter_type == long_long_unsigned_type_node
@@ -2637,7 +2645,25 @@ expand_omp_for_generic (struct omp_region *region,
gsi_prev (&gsif);
tree arr = NULL_TREE;
- if (in_combined_parallel)
+ if (seq_loop)
+ {
+ tree n1 = fold_convert (fd->iter_type, fd->loop.n1);
+ tree n2 = fold_convert (fd->iter_type, fd->loop.n2);
+
+ n1 = force_gimple_operand_gsi_1 (&gsi, n1, is_gimple_reg, NULL_TREE, true,
+ GSI_SAME_STMT);
+ n2 = force_gimple_operand_gsi_1 (&gsi, n2, is_gimple_reg, NULL_TREE, true,
+ GSI_SAME_STMT);
+
+ assign_stmt = gimple_build_assign (istart0, n1);
+ gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+ assign_stmt = gimple_build_assign (iend0, n2);
+ gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+ t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0);
+ }
+ else if (in_combined_parallel)
{
gcc_assert (fd->ordered == 0);
/* In a combined parallel loop, emit a call to
@@ -3059,39 +3085,45 @@ expand_omp_for_generic (struct omp_region *region,
collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb);
/* Emit code to get the next parallel iteration in L2_BB. */
- gsi = gsi_start_bb (l2_bb);
+ if (!seq_loop)
+ {
+ gsi = gsi_start_bb (l2_bb);
- t = build_call_expr (builtin_decl_explicit (next_fn), 2,
- build_fold_addr_expr (istart0),
- build_fold_addr_expr (iend0));
- t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
- false, GSI_CONTINUE_LINKING);
- if (TREE_TYPE (t) != boolean_type_node)
- t = fold_build2 (NE_EXPR, boolean_type_node,
- t, build_int_cst (TREE_TYPE (t), 0));
- gcond *cond_stmt = gimple_build_cond_empty (t);
- gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+ t = build_call_expr (builtin_decl_explicit (next_fn), 2,
+ build_fold_addr_expr (istart0),
+ build_fold_addr_expr (iend0));
+ t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+ false, GSI_CONTINUE_LINKING);
+ if (TREE_TYPE (t) != boolean_type_node)
+ t = fold_build2 (NE_EXPR, boolean_type_node,
+ t, build_int_cst (TREE_TYPE (t), 0));
+ gcond *cond_stmt = gimple_build_cond_empty (t);
+ gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+ }
}
/* Add the loop cleanup function. */
gsi = gsi_last_nondebug_bb (exit_bb);
- if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
- t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
- else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
- t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
- else
- t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
- gcall *call_stmt = gimple_build_call (t, 0);
- if (gimple_omp_return_lhs (gsi_stmt (gsi)))
- gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
- gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
- if (fd->ordered)
+ if (!seq_loop)
{
- tree arr = counts[fd->ordered];
- tree clobber = build_constructor (TREE_TYPE (arr), NULL);
- TREE_THIS_VOLATILE (clobber) = 1;
- gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
- GSI_SAME_STMT);
+ if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
+ t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
+ else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+ t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
+ else
+ t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
+ gcall *call_stmt = gimple_build_call (t, 0);
+ if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+ gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
+ gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
+ if (fd->ordered)
+ {
+ tree arr = counts[fd->ordered];
+ tree clobber = build_constructor (TREE_TYPE (arr), NULL);
+ TREE_THIS_VOLATILE (clobber) = 1;
+ gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
+ GSI_SAME_STMT);
+ }
}
gsi_remove (&gsi, true);
@@ -3104,7 +3136,8 @@ expand_omp_for_generic (struct omp_region *region,
gimple_seq phis;
e = find_edge (cont_bb, l3_bb);
- ne = make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE);
+ ne = make_edge (l2_bb, l3_bb,
+ seq_loop ? EDGE_FALLTHRU : EDGE_FALSE_VALUE);
phis = phi_nodes (l3_bb);
for (gsi = gsi_start (phis); !gsi_end_p (gsi); gsi_next (&gsi))
@@ -3144,7 +3177,8 @@ expand_omp_for_generic (struct omp_region *region,
e = find_edge (cont_bb, l2_bb);
e->flags = EDGE_FALLTHRU;
}
- make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
+ if (!seq_loop)
+ make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
if (gimple_in_ssa_p (cfun))
{
@@ -3203,12 +3237,16 @@ expand_omp_for_generic (struct omp_region *region,
add_bb_to_loop (l2_bb, outer_loop);
- /* We've added a new loop around the original loop. Allocate the
- corresponding loop struct. */
- struct loop *new_loop = alloc_loop ();
- new_loop->header = l0_bb;
- new_loop->latch = l2_bb;
- add_loop (new_loop, outer_loop);
+ struct loop *new_loop = NULL;
+ if (!seq_loop)
+ {
+ /* We've added a new loop around the original loop. Allocate the
+ corresponding loop struct. */
+ new_loop = alloc_loop ();
+ new_loop->header = l0_bb;
+ new_loop->latch = l2_bb;
+ add_loop (new_loop, outer_loop);
+ }
/* Allocate a loop structure for the original loop unless we already
had one. */
@@ -3218,7 +3256,8 @@ expand_omp_for_generic (struct omp_region *region,
struct loop *orig_loop = alloc_loop ();
orig_loop->header = l1_bb;
/* The loop may have multiple latches. */
- add_loop (orig_loop, new_loop);
+ add_loop (orig_loop,
+ new_loop != NULL ? new_loop : outer_loop);
}
}
}
@@ -5665,7 +5704,10 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
original loops from being detected. Fix that up. */
loops_state_set (LOOPS_NEED_FIXUP);
- if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
+ if (region->inside_kernels_p)
+ expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+ inner_stmt);
+ else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
expand_omp_simd (region, &fd);
else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
{
@@ -7750,7 +7792,19 @@ expand_omp (struct omp_region *region)
if (region->type == GIMPLE_OMP_PARALLEL)
determine_parallel_type (region);
else if (region->type == GIMPLE_OMP_TARGET)
- grid_expand_target_grid_body (region);
+ {
+ grid_expand_target_grid_body (region);
+
+ if (region->inner)
+ {
+ gomp_target *entry
+ = as_a <gomp_target *> (last_stmt (region->entry));
+ if (region->inside_kernels_p
+ || (gimple_omp_target_kind (entry)
+ == GF_OMP_TARGET_KIND_OACC_KERNELS))
+ region->inner->inside_kernels_p = true;
+ }
+ }
if (region->type == GIMPLE_OMP_FOR
&& gimple_omp_for_combined_p (last_stmt (region->entry)))
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
new file mode 100644
index 00000000000..4824e530925
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n, unsigned int *a)
+{
+ unsigned int sum = 0;
+
+#pragma acc kernels loop gang reduction(+:sum)
+ for (int i = 0; i < n; i++)
+ sum += a[i];
+
+ return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
new file mode 100644
index 00000000000..d70afb0e662
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n)
+{
+ unsigned int sum = 1;
+
+ #pragma acc kernels loop
+ for (int i = 1; i <= n; i++)
+ sum += i;
+
+ return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
new file mode 100644
index 00000000000..7b127cb6fd9
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
@@ -0,0 +1,18 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop-2.c"
+
+/* Check that only three loops are analyzed, and that all can be
+ parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
new file mode 100644
index 00000000000..a040e096fc1
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop-3.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
new file mode 100644
index 00000000000..070a5b5bf3d
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
new file mode 100644
index 00000000000..1f25e63fbbb
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */
+#define ACC_LOOP
+#include "kernels-loop-n.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
--
2.17.1