This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH, 7/8] Add pass_parloops_oacc_kernels to pass_oacc_kernels


On 15-11-14 13:14, Tom de Vries wrote:
Hi,

I'm submitting a patch series with initial support for the oacc kernels directive.

The patch series uses pass_parallelize_loops to implement parallelization of
loops in the oacc kernels region.

The patch series consists of these 8 patches:
...
     1  Expand oacc kernels after pass_build_ealias
     2  Add pass_oacc_kernels
     3  Add pass_ch_oacc_kernels to pass_oacc_kernels
     4  Add pass_tree_loop_{init,done} to pass_oacc_kernels
     5  Add pass_loop_im to pass_oacc_kernels
     6  Add pass_ccp to pass_oacc_kernels
     7  Add pass_parloops_oacc_kernels to pass_oacc_kernels
     8  Do simple omp lowering for no address taken var
...

This patch adds:
- a specialized version of pass_parallelize_loops called
    pass_parloops_oacc_kernels to pass group pass_oacc_kernels, and
- relevant test-cases.

The pass only handles loops that are in a kernels region, and skips over bits of pass_parallelize_loops that are already done for oacc kernels.

The pass reintroduces the use of omp_expand_local, I haven't managed to make it work yet using the external pass pass_expand_omp_ssa.

An obvious limitation of the patch is the fact that we copy over the clauses from the kernels directive to the generated parallel directive. We'll need to do something more intelligent here, f.i. setting vector_length based on the parallelization factor.

Another limitation is that the pass still needs -ftree-parallelize-loops to trigger.

OK for trunk?

Thanks,
- Tom

2014-11-14  Tom de Vries  <tom@codesourcery.com>

	* passes.def: Add pass_parallelize_loops_oacc_kernels in pass group
	pass_oacc_kernels.  Move pass_expand_omp_ssa into pass group
	pass_oacc_kernels.
	* tree-parloops.c (create_parallel_loop): Add function parameters
	region_entry and bool oacc_kernels_p.  Handle oacc_kernels_p.
	(gen_parallel_loop): Same.  Use omp_expand_local if oacc_kernels_p.
	Call create_parallel_loop with additional args.
	(parallelize_loops): Add function parameter oacc_kernels_p.  Calculate
	dominance info.  Skip loops that are not in a kernels region. Call
	gen_parallel_loop with additional args.
	(pass_parallelize_loops::execute): Call parallelize_loops with false
	argument.
	(pass_data_parallelize_loops_oacc_kernels): New pass_data.
	(class pass_parallelize_loops_oacc_kernels): New pass.
	(pass_parallelize_loops_oacc_kernels::execute)
	(make_pass_parallelize_loops_oacc_kernels): New function.
	* tree-pass.h (make_pass_parallelize_loops_oacc_kernels): Declare.

	* testsuite/libgomp.oacc-c/oacc-kernels-2-run.c: New test.
	* testsuite/libgomp.oacc-c/oacc-kernels-run.c: New test.

	* gcc.dg/oacc-kernels-2.c: New test.
	* gcc.dg/oacc-kernels.c: New test.
---
 gcc/passes.def                                     |   3 +-
 gcc/testsuite/gcc.dg/oacc-kernels-2.c              |  79 +++++++
 gcc/testsuite/gcc.dg/oacc-kernels.c                |  71 ++++++
 gcc/tree-parloops.c                                | 242 ++++++++++++++++-----
 gcc/tree-pass.h                                    |   2 +
 .../testsuite/libgomp.oacc-c/oacc-kernels-2-run.c  |  65 ++++++
 .../testsuite/libgomp.oacc-c/oacc-kernels-run.c    |  59 +++++
 7 files changed, 465 insertions(+), 56 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/oacc-kernels-2.c
 create mode 100644 gcc/testsuite/gcc.dg/oacc-kernels.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c

diff --git a/gcc/passes.def b/gcc/passes.def
index cd9443c..cc09ba9 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -80,9 +80,10 @@ along with GCC; see the file COPYING3.  If not see
 	      NEXT_PASS (pass_tree_loop_init);
 	      NEXT_PASS (pass_lim);
 	      NEXT_PASS (pass_ccp);
+      	      NEXT_PASS (pass_parallelize_loops_oacc_kernels);
+	      NEXT_PASS (pass_expand_omp_ssa);
 	      NEXT_PASS (pass_tree_loop_done);
 	  POP_INSERT_PASSES ()
-	  NEXT_PASS (pass_expand_omp_ssa);
 	  NEXT_PASS (pass_fre);
 	  NEXT_PASS (pass_merge_phi);
 	  NEXT_PASS (pass_cd_dce);
diff --git a/gcc/testsuite/gcc.dg/oacc-kernels-2.c b/gcc/testsuite/gcc.dg/oacc-kernels-2.c
new file mode 100644
index 0000000..1ff4bad
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/oacc-kernels-2.c
@@ -0,0 +1,79 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenacc } */
+/* { dg-options "-fopenacc -ftree-parallelize-loops=32 -O2 -std=c99 -fdump-tree-parloops_oacc_kernels-all -fdump-tree-copyrename" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+#pragma acc kernels copyout (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels copyout (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only three loops are analyzed, and that all can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  It pops up first in
+   all_passes/pass_all_optimizations/pass_rename_ssa_copies.  */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.0 " 1 "copyrename2" } } */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.1 " 1 "copyrename2" } } */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.2 " 1 "copyrename2" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "copyrename*" } } */
diff --git a/gcc/testsuite/gcc.dg/oacc-kernels.c b/gcc/testsuite/gcc.dg/oacc-kernels.c
new file mode 100644
index 0000000..de94aa9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/oacc-kernels.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenacc } */
+/* { dg-options "-fopenacc -ftree-parallelize-loops=32 -O2 -std=c99 -fdump-tree-parloops_oacc_kernels-all -fdump-tree-copyrename" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    a[i] = i * 2;
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function.  It pops up first in
+   all_passes/pass_all_optimizations/pass_rename_ssa_copies.  */
+/* { dg-final { scan-tree-dump-times "Function main._omp_fn.0 " 1 "copyrename2" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "copyrename*" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index e5dca78..7bc945b 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -1611,7 +1611,8 @@ transform_to_exit_first_loop (struct loop *loop,
 
 static basic_block
 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
-		      tree new_data, unsigned n_threads, location_t loc)
+		      tree new_data, unsigned n_threads, location_t loc,
+		      basic_block region_entry, bool oacc_kernels_p)
 {
   gimple_stmt_iterator gsi;
   basic_block bb, paral_bb, for_bb, ex_bb;
@@ -1623,15 +1624,44 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
   /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
   bb = loop_preheader_edge (loop)->src;
   paral_bb = single_pred (bb);
-  gsi = gsi_last_bb (paral_bb);
+  if (!oacc_kernels_p)
+    gsi = gsi_last_bb (paral_bb);
+  else
+    /* Make sure the oacc parallel is inserted on top of the oacc kernels
+       region.  */
+    gsi = gsi_last_bb (region_entry);
 
-  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
-  OMP_CLAUSE_NUM_THREADS_EXPR (t)
-    = build_int_cst (integer_type_node, n_threads);
-  stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
-  gimple_set_location (stmt, loc);
+  if (!oacc_kernels_p)
+    {
+      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+      OMP_CLAUSE_NUM_THREADS_EXPR (t)
+	= build_int_cst (integer_type_node, n_threads);
+      stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+      gimple_set_location (stmt, loc);
 
-  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+    }
+  else
+    {
+      /* Create oacc parallel pragma based on oacc kernels pragma.  */
+      gimple kernels = last_stmt (region_entry);
+      stmt = gimple_build_oacc_parallel (NULL,
+					 gimple_oacc_kernels_clauses (kernels));
+      tree child_fn = gimple_oacc_kernels_child_fn (kernels);
+      gimple_oacc_parallel_set_child_fn (stmt, child_fn);
+      tree data_arg = gimple_oacc_kernels_data_arg (kernels);
+      gimple_oacc_parallel_set_data_arg (stmt, data_arg);
+
+      gimple_set_location (stmt, loc);
+
+      /* Insert oacc parallel pragma after the oacc kernels pragma.  */
+      {
+	gimple_stmt_iterator gsi2;
+	gsi2 = gsi;
+	gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+	gsi_remove (&gsi2, true);
+      }
+    }
 
   /* Initialize NEW_DATA.  */
   if (data)
@@ -1647,12 +1677,18 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
     }
 
-  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
-  bb = split_loop_exit_edge (single_dom_exit (loop));
-  gsi = gsi_last_bb (bb);
-  stmt = gimple_build_omp_return (false);
-  gimple_set_location (stmt, loc);
-  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+  /* Skip insertion of OMP_RETURN for oacc_kernels_p.  We've already generated
+     one when lowering the oacc kernels directive in
+     pass_lower_omp/lower_omp (). */
+  if (!oacc_kernels_p)
+    {
+      /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
+      bb = split_loop_exit_edge (single_dom_exit (loop));
+      gsi = gsi_last_bb (bb);
+      stmt = gimple_build_omp_return (false);
+      gimple_set_location (stmt, loc);
+      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
+    }
 
   /* Extract data for GIMPLE_OMP_FOR.  */
   gcc_assert (loop->header == single_dom_exit (loop)->src);
@@ -1705,7 +1741,11 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
   t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
   OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
 
-  for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
+  for_stmt = gimple_build_omp_for (NULL,
+				   (oacc_kernels_p
+				    ? GF_OMP_FOR_KIND_OACC_LOOP
+				    : GF_OMP_FOR_KIND_FOR),
+				   NULL_TREE, 1, NULL);
   gimple_set_location (for_stmt, loc);
   gimple_omp_for_set_index (for_stmt, 0, initvar);
   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
@@ -1736,7 +1776,7 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
   free_dominance_info (CDI_DOMINATORS);
   calculate_dominance_info (CDI_DOMINATORS);
 
-  return paral_bb;
+  return oacc_kernels_p ? region_entry : paral_bb;
 }
 
 /* Generates code to execute the iterations of LOOP in N_THREADS
@@ -1748,11 +1788,13 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
 static void
 gen_parallel_loop (struct loop *loop,
 		   reduction_info_table_type *reduction_list,
-		   unsigned n_threads, struct tree_niter_desc *niter)
+		   unsigned n_threads, struct tree_niter_desc *niter,
+		   basic_block region_entry, bool oacc_kernels_p)
 {
   tree many_iterations_cond, type, nit;
   tree arg_struct, new_arg_struct;
   gimple_seq stmts;
+  basic_block parallel_head;
   edge entry, exit;
   struct clsn_data clsn_data;
   unsigned prob;
@@ -1829,40 +1871,43 @@ gen_parallel_loop (struct loop *loop,
   if (stmts)
     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
 
-  if (loop->inner)
-    m_p_thread=2;
-  else
-    m_p_thread=MIN_PER_THREAD;
-
-   many_iterations_cond =
-     fold_build2 (GE_EXPR, boolean_type_node,
-                nit, build_int_cst (type, m_p_thread * n_threads));
-
-  many_iterations_cond
-    = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
-		   invert_truthvalue (unshare_expr (niter->may_be_zero)),
-		   many_iterations_cond);
-  many_iterations_cond
-    = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
-  if (stmts)
-    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
-  if (!is_gimple_condexpr (many_iterations_cond))
+  if (!oacc_kernels_p)
     {
+      if (loop->inner)
+	m_p_thread=2;
+      else
+	m_p_thread=MIN_PER_THREAD;
+
+      many_iterations_cond =
+	fold_build2 (GE_EXPR, boolean_type_node,
+		     nit, build_int_cst (type, m_p_thread * n_threads));
+
+      many_iterations_cond
+	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+		       invert_truthvalue (unshare_expr (niter->may_be_zero)),
+		       many_iterations_cond);
       many_iterations_cond
-	= force_gimple_operand (many_iterations_cond, &stmts,
-				true, NULL_TREE);
+	= force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
       if (stmts)
 	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
-    }
+      if (!is_gimple_condexpr (many_iterations_cond))
+	{
+	  many_iterations_cond
+	    = force_gimple_operand (many_iterations_cond, &stmts,
+				    true, NULL_TREE);
+	  if (stmts)
+	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+	}
 
-  initialize_original_copy_tables ();
+      initialize_original_copy_tables ();
 
-  /* We assume that the loop usually iterates a lot.  */
-  prob = 4 * REG_BR_PROB_BASE / 5;
-  loop_version (loop, many_iterations_cond, NULL,
-		prob, prob, REG_BR_PROB_BASE - prob, true);
-  update_ssa (TODO_update_ssa);
-  free_original_copy_tables ();
+      /* We assume that the loop usually iterates a lot.  */
+      prob = 4 * REG_BR_PROB_BASE / 5;
+      loop_version (loop, many_iterations_cond, NULL,
+		    prob, prob, REG_BR_PROB_BASE - prob, true);
+      update_ssa (TODO_update_ssa);
+      free_original_copy_tables ();
+    }
 
   /* Base all the induction variables in LOOP on a single control one.  */
   canonicalize_loop_ivs (loop, &nit, true);
@@ -1879,19 +1924,31 @@ gen_parallel_loop (struct loop *loop,
   entry = loop_preheader_edge (loop);
   exit = single_dom_exit (loop);
 
-  eliminate_local_variables (entry, exit);
-  /* In the old loop, move all variables non-local to the loop to a structure
-     and back, and create separate decls for the variables used in loop.  */
-  separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
-			    &new_arg_struct, &clsn_data);
+  /* This rewrites the body in terms of new variables.  This has already
+     been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
+  if (!oacc_kernels_p)
+    {
+      eliminate_local_variables (entry, exit);
+      /* In the old loop, move all variables non-local to the loop to a
+	 structure and back, and create separate decls for the variables used in
+	 loop.  */
+      separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
+				&new_arg_struct, &clsn_data);
+    }
+  else
+    {
+      arg_struct = NULL_TREE;
+      new_arg_struct = NULL_TREE;
+    }
 
   /* Create the parallel constructs.  */
   loc = UNKNOWN_LOCATION;
   cond_stmt = last_stmt (loop->header);
   if (cond_stmt)
     loc = gimple_location (cond_stmt);
-  create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
-			new_arg_struct, n_threads, loc);
+  parallel_head = create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
+					new_arg_struct, n_threads, loc,
+					region_entry, oacc_kernels_p);
   if (reduction_list->elements () > 0)
     create_call_for_reduction (loop, reduction_list, &clsn_data);
 
@@ -1905,6 +1962,16 @@ gen_parallel_loop (struct loop *loop,
      removed statements.  */
   FOR_EACH_LOOP (loop, 0)
     free_numbers_of_iterations_estimates_loop (loop);
+
+  if (oacc_kernels_p)
+    {
+      /* Expand the parallel constructs.  We do it directly here instead of
+	 running a separate expand_omp pass, since it is more efficient, and
+	 less likely to cause troubles with further analyses not being able to
+	 deal with the OMP trees.  */
+
+      omp_expand_local (parallel_head);
+    }
 }
 
 /* Returns true when LOOP contains vector phi nodes.  */
@@ -2131,7 +2198,7 @@ try_create_reduction_list (loop_p loop,
    otherwise.  */
 
 bool
-parallelize_loops (void)
+parallelize_loops (bool oacc_kernels_p)
 {
   unsigned n_threads = flag_tree_parallelize_loops;
   bool changed = false;
@@ -2140,6 +2207,7 @@ parallelize_loops (void)
   struct obstack parloop_obstack;
   HOST_WIDE_INT estimated;
   source_location loop_loc;
+  basic_block region_entry, region_exit;
 
   /* Do not parallelize loops in the functions created by parallelization.  */
   if (parallelized_function_p (cfun->decl))
@@ -2151,9 +2219,25 @@ parallelize_loops (void)
   reduction_info_table_type reduction_list (10);
   init_stmt_vec_info_vec ();
 
+  calculate_dominance_info (CDI_DOMINATORS);
+
   FOR_EACH_LOOP (loop, 0)
     {
       reduction_list.empty ();
+
+      if (oacc_kernels_p)
+	{
+	  if (!loop_in_oacc_kernels_region_p (loop, &region_entry, &region_exit))
+	    continue;
+	  else
+	    {
+	      if (dump_file && (dump_flags & TDF_DETAILS))
+		fprintf (dump_file,
+			 "Trying loop %d with header bb %d in oacc kernels region\n",
+			 loop->num, loop->header->index);
+	    }
+	}
+
       if (dump_file && (dump_flags & TDF_DETAILS))
       {
         fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
@@ -2223,8 +2307,9 @@ parallelize_loops (void)
 	  fprintf (dump_file, "\nloop at %s:%d: ",
 		   LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
       }
+
       gen_parallel_loop (loop, &reduction_list,
-			 n_threads, &niter_desc);
+			 n_threads, &niter_desc, region_entry, oacc_kernels_p);
     }
 
   free_stmt_vec_info_vec ();
@@ -2275,7 +2360,7 @@ pass_parallelize_loops::execute (function *fun)
   if (number_of_loops (fun) <= 1)
     return 0;
 
-  if (parallelize_loops ())
+  if (parallelize_loops (false))
     {
       fun->curr_properties &= ~(PROP_gimple_eomp);
       return TODO_update_ssa;
@@ -2293,4 +2378,51 @@ make_pass_parallelize_loops (gcc::context *ctxt)
 }
 
 
+namespace {
+
+const pass_data pass_data_parallelize_loops_oacc_kernels =
+{
+  GIMPLE_PASS, /* type */
+  "parloops_oacc_kernels", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
+  ( PROP_cfg | PROP_ssa ), /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
+{
+public:
+  pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; }
+  virtual unsigned int execute (function *);
+
+}; // class pass_parallelize_loops_oacc_kernels
+
+unsigned
+pass_parallelize_loops_oacc_kernels::execute (function *fun)
+{
+  if (number_of_loops (fun) <= 1)
+    return 0;
+
+  if (parallelize_loops (true))
+    return TODO_cleanup_cfg | TODO_rebuild_alias;
+  return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+{
+  return new pass_parallelize_loops_oacc_kernels (ctxt);
+}
+
 #include "gt-tree-parloops.h"
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 1f599fa..e769e4f 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -370,6 +370,8 @@ extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
+extern gimple_opt_pass *
+  make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt);
diff --git a/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c
new file mode 100644
index 0000000..5cdae0b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-2-run.c
@@ -0,0 +1,65 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2 -std=c99" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+#pragma acc kernels copyout (a[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      a[i] = i * 2;
+  }
+
+#pragma acc kernels copyout (b[0:N])
+  {
+    for (COUNTERTYPE i = 0; i < N; i++)
+      b[i] = i * 4;
+  }
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c
new file mode 100644
index 0000000..b9e62a0
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/oacc-kernels-run.c
@@ -0,0 +1,59 @@
+/* { dg-do run } */
+/* { dg-options "-ftree-parallelize-loops=32 -O2 -std=c99" } */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define N (1024 * 512)
+#define N_REF 4293394432
+
+#if 1
+#define COUNTERTYPE unsigned int
+#else
+#define COUNTERTYPE int
+#endif
+
+int
+main (void)
+{
+  unsigned int i;
+
+  unsigned int *__restrict a;
+  unsigned int *__restrict b;
+  unsigned int *__restrict c;
+
+  a = malloc (N * sizeof (unsigned int));
+  b = malloc (N * sizeof (unsigned int));
+  c = malloc (N * sizeof (unsigned int));
+
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    a[i] = i * 2;
+
+  for (COUNTERTYPE i = 0; i < N; i++)
+    b[i] = i * 4;
+
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  {
+    for (COUNTERTYPE ii = 0; ii < N; ii++)
+      c[ii] = a[ii] + b[ii];
+  }
+
+  {
+    unsigned int sum = 0;
+
+    for (COUNTERTYPE i = 0; i < N; i++)
+      sum += c[i];
+
+    printf ("sum: %u\n", sum);
+
+    if (sum != N_REF)
+      abort ();
+  }
+
+  free (a);
+  free (b);
+  free (c);
+
+  return 0;
+}
-- 
1.9.1






Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]