This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[gomp4] acc reductions with multiple variables


This patch does the following:

  * introduce GOACC_get_thread_num / GOACC_get_num_threads and
    GOACC_acc_get_device_type builtins
  * support for num_gangs in the middle end
  * support reductions with multiple variables

This patch also resolves an issue when reductions are preformed on the
host, i.e. ACC_DEVICE_TYPE=host. The problem was, it used to assume that
there were vector_length number of threads. That's fine for the
accelerator, but not for the host where the current implementation only
supports one thread. To get around this problem, this patch uses
GOACC_get_thread_num and GOACC_get_num_threads to determine thread info.
Those functions are internal to libgomp. I probably should implement
them as built-ins for the ptx backend because the thread information are
exposed as registers in that target.

I'll apply this to gomp-4_0-branch next week unless says otherwise.

Cesar
2014-10-31  Cesar Philippidis  <cesar@codesourcery.com>
	    Thomas Schwinge  <thomas@codesourcery.com>

	gcc/
	* oacc-builtins.def (BUILT_IN_GOACC_GET_NUM_THREADS): New built-in
	function.
	(BUILT_IN_ACC_GET_DEVICE_TYPE): Likewise.
	* omp-low.c (oacc_max_threads): New function.
	(lower_reduction_clauses): Use the GOACC thread builtin functions to
	determine the thread count. Handle multiple reduction variables.
	(expand_omp_for_static_nochunk): Likewise.
	(expand_omp_for_static_chunk): Likewise.
	(finalize_reduction_data): General cleanups.
	(process_reduction_data): Use acc_get_device_type to determine nthreads
	at runtime.

	libgomp/
	* libgomp.map (GOACC_get_thread_num): Declare.
	(GOACC_get_num_threads): Declare.
	* libgomp_g.h (GOACC_get_thread_num): Declare.
	(GOACC_get_num_threads): Declare.
	* oacc-parallel.c (GOACC_parallel): Handle num_gangs.
	(GOACC_get_num_threads): New function.
	(GOACC_get_thread_num): New function.
	* oacc-ptx.h: New file.
	* plugin-nvptx.c (ABORT_PTX): Remove macro. Move to oacc-ptx.h.
	(ACC_ON_DEVICE_PTX): Likewise.
	(link_ptx): Also link ptx code defined by GOACC_INTERNAL_PTX.
	(PTX_exec): Handle gangs/CTAs.
	* testsuite/libgomp.oacc-c/reduction-1.c: New test.
	* testsuite/libgomp.oacc-c/reduction-2.c: New test.
	* testsuite/libgomp.oacc-c/reduction-3.c: New test.
	* testsuite/libgomp.oacc-c/reduction-4.c: New test.
	* testsuite/libgomp.oacc-c/reduction-5.: New test.
	* testsuite/libgomp.oacc-c/reduction-initial-1.c: New test.
	* testsuite/libgomp.oacc-fortran/reduction-1.f90: New test.
	* testsuite/libgomp.oacc-fortran/reduction-2.f90: New test.
	* testsuite/libgomp.oacc-fortran/reduction-3.f90: New test.
	* testsuite/libgomp.oacc-fortran/reduction-4.f90: New test.
	* testsuite/libgomp.oacc-fortran/reduction-5.f90: New test.
	* testsuite/libgomp.oacc-fortran/reduction-6.f90: New test.


diff --git a/gcc/oacc-builtins.def b/gcc/oacc-builtins.def
index 080611b..7f4c557 100644
--- a/gcc/oacc-builtins.def
+++ b/gcc/oacc-builtins.def
@@ -27,6 +27,8 @@ along with GCC; see the file COPYING3.  If not see
 
    See builtins.def for details.  */
 
+DEF_GOACC_BUILTIN (BUILT_IN_ACC_GET_DEVICE_TYPE, "acc_get_device_type",
+		   BT_FN_INT, ATTR_NOTHROW_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_START, "GOACC_data_start",
 		   BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DATA_END, "GOACC_data_end",
@@ -45,3 +47,7 @@ DEF_GOACC_BUILTIN (BUILT_IN_GOACC_WAIT, "GOACC_wait",
 		   ATTR_NOTHROW_LIST)
 DEF_GOACC_BUILTIN_COMPILER (BUILT_IN_ACC_ON_DEVICE, "acc_on_device",
 			    BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_THREAD_NUM, "GOACC_get_thread_num",
+		   BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_NUM_THREADS, "GOACC_get_num_threads",
+		   BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index a04b012..101d371 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -253,6 +253,52 @@ omp_get_id (tree node)
   return IDENTIFIER_POINTER(get_identifier (temp_name));
 }
 
+/* Determine the number of threads OpenACC threads used to determine the
+   size of the array of partial reductions.  Currently, this is num_gangs
+   * vector_length.  This value may be different than GOACC_GET_NUM_THREADS,
+   because it is independed of the device used.  */
+
+static tree
+oacc_max_threads (omp_context *ctx)
+{
+  tree nthreads, vector_length, gangs, clauses;
+
+  gangs = fold_convert (sizetype, integer_one_node);
+  vector_length = gangs;
+
+  /* The reduction clause may be nested inside a loop directive.
+     Scan for the innermost vector_length clause.  */
+  for (omp_context *oc = ctx; oc; oc = oc->outer)
+    {
+      if (gimple_code (oc->stmt) != GIMPLE_OACC_PARALLEL)
+	continue;
+
+      clauses = gimple_oacc_parallel_clauses (oc->stmt);
+
+      vector_length = find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH);
+      if (vector_length)
+	vector_length = fold_convert_loc (OMP_CLAUSE_LOCATION (vector_length),
+					  sizetype,
+					  OMP_CLAUSE_VECTOR_LENGTH_EXPR
+					  (vector_length));
+      else
+	vector_length = fold_convert (sizetype, integer_one_node);
+
+      gangs = find_omp_clause (clauses, OMP_CLAUSE_NUM_GANGS);
+      if (gangs)
+        gangs = fold_convert_loc (OMP_CLAUSE_LOCATION (gangs), sizetype,
+				  OMP_CLAUSE_NUM_GANGS_EXPR (gangs));
+      else
+	gangs = fold_convert (sizetype, integer_one_node);
+
+      break;
+    }
+
+  nthreads = fold_build2 (MULT_EXPR, sizetype, gangs, vector_length);
+
+  return nthreads;
+}
+
 /* Holds a decl for __OPENMP_TARGET__.  */
 static GTY(()) tree offload_symbol_decl;
 
@@ -4429,6 +4475,57 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
     gimple_seq_add_stmt (stmt_list, gimple_build_label (label));
 }
 
+static void
+lower_reduction_var_helper (gimple_seq *stmt_seqp, omp_context *ctx, tree tid,
+			    tree var, tree new_var)
+{
+  /* The atomic add at the end of the sum creates unnecessary
+     write contention on accelerators.  To work around this,
+     create an array to store the partial reductions. Later, in
+     lower_omp_for (for openacc), the values of array will be
+     combined.  */
+
+  tree t = NULL_TREE, array, x;
+  tree type = get_base_type (var);
+  gimple stmt;
+
+  /* Now insert the partial reductions into the array.  */
+
+  /* Find the reduction array.  */
+
+  tree ptype = build_pointer_type (type);
+
+  t = lookup_reduction (omp_get_id (var), ctx);
+  t = build_receiver_ref (t, false, ctx->outer);
+
+  array = create_tmp_var (ptype, NULL);
+  gimplify_assign (array, t, stmt_seqp);
+
+  tree ptr = create_tmp_var (TREE_TYPE (array), NULL);
+
+  /* Find the reduction array.  */
+
+  /* testing a unary conversion.  */
+  tree offset = create_tmp_var (sizetype, NULL);
+  gimplify_assign (offset, TYPE_SIZE_UNIT (type),
+		   stmt_seqp);
+  t = create_tmp_var (sizetype, NULL);
+  gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype, tid)),
+		   stmt_seqp);
+  stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, t);
+  gimple_seq_add_stmt (stmt_seqp, stmt);
+
+  /* Offset expression.  Does the POINTER_PLUS_EXPR take care
+     of adding sizeof(var) to the array?  */
+  ptr = create_tmp_var (ptype, NULL);
+  stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, unshare_expr(ptr),
+				       array, offset);
+  gimple_seq_add_stmt (stmt_seqp, stmt);
+
+  /* Move the local sum to gfc$sum[i].  */
+  x = unshare_expr (build_simple_mem_ref (ptr));
+  stmt = gimplify_assign (x, new_var, stmt_seqp);
+}
 
 /* Generate code to implement the REDUCTION clauses.  */
 
@@ -4437,7 +4534,7 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 {
   gimple_seq sub_seq = NULL;
   gimple stmt;
-  tree x, c;
+  tree x, c, tid;
   int count = 0;
 
   /* SIMD reductions are handled in lower_rec_input_clauses.  */
@@ -4462,6 +4559,17 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
   if (count == 0)
     return;
 
+  /* Initialize thread info for OpenACC.  */
+  if (is_gimple_omp_oacc_specifically (ctx->stmt))
+    {
+      /* Get the current thread id.  */
+      tree call = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+      tid = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+      gimple stmt = gimple_build_call (call, 0);
+      gimple_call_set_lhs (stmt, tid);
+      gimple_seq_add_stmt (stmt_seqp, stmt);
+    }
+
   for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
     {
       tree var, ref, new_var;
@@ -4498,114 +4606,8 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	    }
 	  else
 	    {
-	  /* The atomic add at the end of the sum creates unnecessary
-	     write contention on accelerators.  To work around this,
-	     create an array to store the partial reductions. Later, in
-	     lower_omp_for (for openacc), the values of array will be
-	     combined.  */
-
-	  tree t = NULL_TREE, array, nthreads;
-	  tree type = get_base_type (var);
-
-	  /* First ensure that the current tid is less than vector_length.  */
-	  tree exit_label = create_artificial_label (UNKNOWN_LOCATION);
-	  tree reduction_label = create_artificial_label (UNKNOWN_LOCATION);
-
-	  /* Get the current thread id.  */
-	  tree call = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM);
-	  gimple stmt = gimple_build_call (call, 1, integer_zero_node);
-	  tree fntype = gimple_call_fntype (stmt);
-	  tree tid = create_tmp_var (TREE_TYPE (fntype), NULL);
-	  gimple_call_set_lhs (stmt, tid);
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* Find the total number of threads.  A reduction clause
-	     only appears inside a loop construction or a combined
-	     parallel and loop construct.  */
-	  tree c;
-
-	  if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR)
-	    c = gimple_oacc_parallel_clauses (ctx->outer->stmt);
-	  else
-	    c = gimple_oacc_parallel_clauses (ctx->stmt);
-
-	  t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
-
-	  if (t)
-	    {
-	      t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
-				    integer_type_node,
-				    OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
-	    }
-
-	  if (!t)
-	    t = integer_one_node;
-
-	  /* Extract the number of threads.  */
-	  nthreads = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype, t),
-			   stmt_seqp);
-	  stmt = gimple_build_assign_with_ops  (MINUS_EXPR, nthreads, nthreads,
-				 fold_build1 (NOP_EXPR, sizetype,
-					      integer_one_node));
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* If tid >= nthreads, goto exit_label.  */
-	  t = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (t, fold_build1 (NOP_EXPR, sizetype, tid),
-			   stmt_seqp);
-	  stmt = gimple_build_cond (GT_EXPR, t, nthreads, exit_label,
-				    reduction_label);
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* Place the reduction_label here.  */
-
-	  gimple_seq_add_stmt (stmt_seqp,
-			       gimple_build_label (reduction_label));
-
-	  /* Now insert the partial reductions into the array.  */
-
-	  /* Find the reduction array.  */
-
-	  tree ptype = build_pointer_type (type);
-
-	  t = lookup_reduction (omp_get_id (var), ctx);
-	  t = build_receiver_ref (t, false, ctx->outer);
-
-	  array = create_tmp_var (ptype, NULL);
-	  gimplify_assign (array, t, stmt_seqp);
-
-	  tree ptr = create_tmp_var (TREE_TYPE (array), NULL);
-
-	  /* Find the reduction array.  */
-
-	  /* testing a unary conversion.  */
-	  tree offset = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (offset, TYPE_SIZE_UNIT (type),
-			   stmt_seqp);
-	  t = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype,
-							 tid)),
-			   stmt_seqp);
-	  stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, t);
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* Offset expression.  Does the POINTER_PLUS_EXPR take care
-	     of adding sizeof(var) to the array?  */
-	  ptr = create_tmp_var (ptype, NULL);
-	  stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR,
-					       unshare_expr(ptr),
-					       array, offset);
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* Move the local sum to gfc$sum[i].  */
-	  x = unshare_expr (build_simple_mem_ref (ptr));
-	  stmt = gimplify_assign (x, new_var, stmt_seqp);
-
-	  /* Place exit label here.  */
-	  gimple_seq_add_stmt (stmt_seqp, gimple_build_label (exit_label));
-
-	  return;
+	      lower_reduction_var_helper (stmt_seqp, ctx, tid, var, new_var);
+	      return;
 	    }
 	}
 
@@ -4626,12 +4628,22 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	}
       else
 	{
-	  x = build2 (code, TREE_TYPE (ref), ref, new_var);
-	  ref = build_outer_var_ref (var, ctx);
-	  gimplify_assign (ref, x, &sub_seq);
+	  if (is_gimple_omp_oacc_specifically (ctx->stmt))
+	    {
+	      lower_reduction_var_helper (stmt_seqp, ctx, tid, var, new_var);
+	    }
+	  else
+	    {
+	      x = build2 (code, TREE_TYPE (ref), ref, new_var);
+	      ref = build_outer_var_ref (var, ctx);
+	      gimplify_assign (ref, x, &sub_seq);
+	    }
 	}
     }
 
+  if (is_gimple_omp_oacc_specifically (ctx->stmt))
+    return;
+
   stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_GOMP_ATOMIC_START),
 			    0);
   gimple_seq_add_stmt (stmt_seqp, stmt);
@@ -7045,8 +7057,10 @@ expand_omp_for_static_nochunk (struct omp_region *region,
       threadid = build_call_expr (threadid, 0);
       break;
     case GF_OMP_FOR_KIND_OACC_LOOP:
-      nthreads = integer_one_node;
-      threadid = integer_zero_node;
+      nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+      nthreads = build_call_expr (nthreads, 0);
+      threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+      threadid = build_call_expr (threadid, 0);
       break;
     default:
       gcc_unreachable ();
@@ -7449,8 +7463,10 @@ expand_omp_for_static_chunk (struct omp_region *region,
       threadid = build_call_expr (threadid, 0);
       break;
     case GF_OMP_FOR_KIND_OACC_LOOP:
-      nthreads = integer_one_node;
-      threadid = integer_zero_node;
+      nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+      nthreads = build_call_expr (nthreads, 0);
+      threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+      threadid = build_call_expr (threadid, 0);
       break;
     default:
       gcc_unreachable ();
@@ -10044,11 +10060,10 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
     }
 }
 
-/* Helper function to finalize local data for the reduction arrays. The
-   reduction array needs to be reduced to the original reduction variable.
-   FIXME: This function assumes that there are vector_length threads in
-   total.  Also, it assumes that there are at least vector_length iterations
-   in the for loop.  */
+/* Helper function to process the array of partial reductions.  Nthreads
+   indicates the number of threads.  Unfortunately, GOACC_GET_NUM_THREADS
+   cannot be used here, because nthreads on the host may be different than
+   on the accelerator. */
 
 static void
 finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
@@ -10056,7 +10071,7 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
 {
   gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt));
 
-  tree c, var, array, loop_header, loop_body, loop_exit, type;
+  tree c, x, var, array, loop_header, loop_body, loop_exit, type;
   gimple stmt;
 
   /* Create for loop.
@@ -10080,8 +10095,8 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
   /* Insert the loop header label here.  */
   gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header));
 
-  /* Loop if ix >= nthreads.  */
-  tree x = create_tmp_var (sizetype, NULL);
+  /* Exit loop if ix >= nthreads.  */
+  x = create_tmp_var (sizetype, NULL);
   gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp);
   stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body);
   gimple_seq_add_stmt (stmt_seqp, stmt);
@@ -10123,7 +10138,6 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
       gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp);
 
       /* Find the original reduction variable.  */
-      tree x = build_outer_var_ref (var, ctx);
       if (is_reference (var))
 	var = build_simple_mem_ref (var);
 
@@ -10196,14 +10210,15 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
 
   for (gsi = gsi_start (*body); !gsi_end_p (gsi); gsi_next (&gsi))
     {
-      tree call;
-      tree clauses, nthreads, t, c;
+      tree clauses, nthreads, t, c, acc_device, acc_device_host, call,
+	enter, exit;
       bool reduction_found = false;
  
       stmt = gsi_stmt (gsi);
 
       switch (gimple_code (stmt))
 	{
+	  /* FIXME: A reduction may also appear in an oacc parallel.  */
 	case GIMPLE_OMP_FOR:
 	  clauses = gimple_omp_for_clauses (stmt);
 
@@ -10221,52 +10236,53 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
 	  ctx = maybe_lookup_ctx (stmt);
 	  t = NULL_TREE;
 
-	  /* The reduction clause may be nested inside a loop directive.
-	     Scan for the innermost vector_length clause.  */
-	  for (omp_context *oc = ctx; oc; oc = oc->outer)
-	    {
-	      switch (gimple_code (oc->stmt))
-		{
-		case GIMPLE_OACC_PARALLEL:
-		  c = gimple_oacc_parallel_clauses (oc->stmt);
-		  break;
-		case GIMPLE_OMP_FOR:
-		  c = gimple_omp_for_clauses (oc->stmt);
-		  break;
-		default:
-		  c = NULL_TREE;
-		  break;
-		}
-
-	      if (c && gimple_code (oc->stmt) == GIMPLE_OACC_PARALLEL)
-		{
-		  t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
-		  if (t)
-		    t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
-					  integer_type_node,
-					  OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
-		  break;
-		}
-	    }
-
-	  if (!t)
-	    t = integer_one_node;
-
 	  /* Extract the number of threads.  */
-	  nthreads = create_tmp_var (TREE_TYPE (t), NULL);
+	  nthreads = create_tmp_var (sizetype, NULL);
+	  t = oacc_max_threads (ctx);
 	  gimplify_assign (nthreads, t, in_stmt_seqp);
 
-	  /* Ensure nthreads >= 1.  */
-	  stmt = gimple_build_assign_with_ops (MAX_EXPR, nthreads, nthreads,
-				          fold_convert(TREE_TYPE (nthreads),
-						       integer_one_node));
+	  /* Determine if this is kernel will be executed on the host.  */
+	  call = builtin_decl_explicit (BUILT_IN_ACC_GET_DEVICE_TYPE);
+	  acc_device = create_tmp_var (integer_type_node, ".acc_device_type");
+	  stmt = gimple_build_call (call, 0);
+	  gimple_call_set_lhs (stmt, acc_device);
 	  gimple_seq_add_stmt (in_stmt_seqp, stmt);
 
-	  /* Set the number of threads.  */
-	  /* FIXME: This needs to handle accelerators  */
-	  call = builtin_decl_explicit (BUILT_IN_OMP_SET_NUM_THREADS);
-	  stmt = gimple_build_call (call, 1, nthreads);
+	  /* Set nthreads = 1 for ACC_DEVICE_TYPE=host.  */
+	  acc_device_host = create_tmp_var (integer_type_node,
+					    ".acc_device_host");
+	  gimplify_assign (acc_device_host, build_int_cst (integer_type_node,
+							   2),
+			   in_stmt_seqp);
+
+	  enter = create_artificial_label (UNKNOWN_LOCATION);
+	  exit = create_artificial_label (UNKNOWN_LOCATION);
+
+	  stmt = gimple_build_cond (EQ_EXPR, acc_device, acc_device_host,
+				    enter, exit);
+	  gimple_seq_add_stmt (in_stmt_seqp, stmt);
+	  gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (enter));
+	  gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype,
+						  integer_one_node),
+			   in_stmt_seqp);
+	  gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (exit));
+
+	  /* Also, set nthreads = 1 for ACC_DEVICE_TYPE=host_nonshm.  */
+	  gimplify_assign (acc_device_host, build_int_cst (integer_type_node,
+							   3),
+			   in_stmt_seqp);
+
+	  enter = create_artificial_label (UNKNOWN_LOCATION);
+	  exit = create_artificial_label (UNKNOWN_LOCATION);
+
+	  stmt = gimple_build_cond (EQ_EXPR, acc_device, acc_device_host,
+				    enter, exit);
 	  gimple_seq_add_stmt (in_stmt_seqp, stmt);
+	  gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (enter));
+	  gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype,
+						  integer_one_node),
+			   in_stmt_seqp);
+	  gimple_seq_add_stmt (in_stmt_seqp, gimple_build_label (exit));
 
 	  initialize_reduction_data (clauses, nthreads, in_stmt_seqp, ctx);
 	  finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx);
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index c6a88a2..21120c4 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -321,6 +321,8 @@ GOACC_2.0 {
 	GOACC_parallel;
 	GOACC_update;
 	GOACC_wait;
+	GOACC_get_thread_num;
+	GOACC_get_num_threads;
 };
 
 # FIXME: Hygiene/grouping/naming?
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index 35b0627..df849e6 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -230,5 +230,7 @@ extern void GOACC_update (int device, const void *openmp_target, size_t mapnum,
 			  unsigned short *kinds, int async,
 			  int num_waits, ...);
 extern void GOACC_wait (int, int, ...);
+extern int GOACC_get_num_threads (void);
+extern int GOACC_get_thread_num (void);
 
 #endif /* LIBGOMP_G_H */
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 1639244..a867de0 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -117,9 +117,6 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target,
   splay_tree_key tgt_fn_key;
   void (*tgt_fn);
 
-  if (num_gangs != 1)
-    gomp_fatal ("num_gangs (%d) different from one is not yet supported",
-		num_gangs);
   if (num_workers != 1)
     gomp_fatal ("num_workers (%d) different from one is not yet supported",
 		num_workers);
@@ -389,3 +386,15 @@ GOACC_wait (int async, int num_waits, ...)
 
   va_end (ap);
 }
+
+int
+GOACC_get_num_threads (void)
+{
+  return 1;
+}
+
+int
+GOACC_get_thread_num (void)
+{
+  return 0;
+}
diff --git a/libgomp/oacc-ptx.h b/libgomp/oacc-ptx.h
new file mode 100644
index 0000000..782b6f4
--- /dev/null
+++ b/libgomp/oacc-ptx.h
@@ -0,0 +1,176 @@
+#define ABORT_PTX				\
+  ".version 3.1\n"				\
+  ".target sm_30\n"				\
+  ".address_size 64\n"				\
+  ".visible .func abort;\n"			\
+  ".visible .func abort\n"			\
+  "{\n"						\
+  "trap;\n"					\
+  "ret;\n"					\
+  "}\n"						\
+  ".visible .func _gfortran_abort;\n"		\
+  ".visible .func _gfortran_abort\n"		\
+  "{\n"						\
+  "trap;\n"					\
+  "ret;\n"					\
+  "}\n" \
+
+/* Generated with:
+
+   $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
+*/
+#define ACC_ON_DEVICE_PTX						\
+  "        .version        3.1\n"					\
+  "        .target sm_30\n"						\
+  "        .address_size 64\n"						\
+  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
+  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
+  "{\n"									\
+  "        .reg.u32 %ar1;\n"						\
+  ".reg.u32 %retval;\n"							\
+  "        .reg.u64 %hr10;\n"						\
+  "        .reg.u32 %r24;\n"						\
+  "        .reg.u32 %r25;\n"						\
+  "        .reg.pred %r27;\n"						\
+  "        .reg.u32 %r30;\n"						\
+  "        ld.param.u32 %ar1, [%in_ar1];\n"				\
+  "                mov.u32 %r24, %ar1;\n"				\
+  "                setp.ne.u32 %r27,%r24,4;\n"				\
+  "                set.u32.eq.u32 %r30,%r24,5;\n"			\
+  "                neg.s32 %r25, %r30;\n"				\
+  "        @%r27   bra     $L3;\n"					\
+  "                mov.u32 %r25, 1;\n"					\
+  "$L3:\n"								\
+  "                mov.u32 %retval, %r25;\n"				\
+  "        st.param.u32    [%out_retval], %retval;\n"			\
+  "        ret;\n"							\
+  "        }\n"								\
+  ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
+  ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
+  "{\n"									\
+  "        .reg.u64 %ar1;\n"						\
+  ".reg.u32 %retval;\n"							\
+  "        .reg.u64 %hr10;\n"						\
+  "        .reg.u64 %r25;\n"						\
+  "        .reg.u32 %r26;\n"						\
+  "        .reg.u32 %r27;\n"						\
+  "        ld.param.u64 %ar1, [%in_ar1];\n"				\
+  "                mov.u64 %r25, %ar1;\n"				\
+  "                ld.u32  %r26, [%r25];\n"				\
+  "        {\n"								\
+  "                .param.u32 %retval_in;\n"				\
+  "        {\n"								\
+  "                .param.u32 %out_arg0;\n"				\
+  "                st.param.u32 [%out_arg0], %r26;\n"			\
+  "                call (%retval_in), acc_on_device, (%out_arg0);\n"	\
+  "        }\n"								\
+  "                ld.param.u32    %r27, [%retval_in];\n"		\
+  "}\n"									\
+  "                mov.u32 %retval, %r27;\n"				\
+  "        st.param.u32    [%out_retval], %retval;\n"			\
+  "        ret;\n"							\
+  "        }"
+
+ #define GOACC_INTERNAL_PTX						\
+  ".version 3.1\n" \
+  ".target sm_30\n" \
+  ".address_size 64\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" \
+  ".extern .func abort;\n" \
+  ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n"	\
+  "{\n"									\
+  ".reg .u32 %retval;\n"						\
+  ".reg .u64 %hr10;\n"							\
+  ".reg .u32 %r22;\n"							\
+  ".reg .u32 %r23;\n"							\
+  ".reg .u32 %r24;\n"							\
+  ".reg .u32 %r25;\n"							\
+  ".reg .u32 %r26;\n"							\
+  ".reg .u32 %r27;\n"							\
+  ".reg .u32 %r28;\n"							\
+  ".reg .u32 %r29;\n"							\
+  "mov.u32 %r26,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r26;\n"					\
+  "call (%retval_in),GOACC_ntid,(%out_arg0);\n"				\
+  "}\n"									\
+  "ld.param.u32 %r27,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r22,%r27;\n"						\
+  "mov.u32 %r28,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r28;\n"					\
+  "call (%retval_in),GOACC_nctaid,(%out_arg0);\n"			\
+  "}\n"									\
+  "ld.param.u32 %r29,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r23,%r29;\n"						\
+  "mul.lo.u32 %r24,%r22,%r23;\n"					\
+  "mov.u32 %r25,%r24;\n"						\
+  "mov.u32 %retval,%r25;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"									\
+  ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n"	\
+  "{\n"									\
+  ".reg .u32 %retval;\n"						\
+  ".reg .u64 %hr10;\n"							\
+  ".reg .u32 %r22;\n"							\
+  ".reg .u32 %r23;\n"							\
+  ".reg .u32 %r24;\n"							\
+  ".reg .u32 %r25;\n"							\
+  ".reg .u32 %r26;\n"							\
+  ".reg .u32 %r27;\n"							\
+  ".reg .u32 %r28;\n"							\
+  ".reg .u32 %r29;\n"							\
+  ".reg .u32 %r30;\n"							\
+  ".reg .u32 %r31;\n"							\
+  ".reg .u32 %r32;\n"							\
+  ".reg .u32 %r33;\n"							\
+  "mov.u32 %r28,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r28;\n"					\
+  "call (%retval_in),GOACC_ntid,(%out_arg0);\n"				\
+  "}\n"									\
+  "ld.param.u32 %r29,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r22,%r29;\n"						\
+  "mov.u32 %r30,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r30;\n"					\
+  "call (%retval_in),GOACC_ctaid,(%out_arg0);\n"			\
+  "}\n"									\
+  "ld.param.u32 %r31,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r23,%r31;\n"						\
+  "mul.lo.u32 %r24,%r22,%r23;\n"					\
+  "mov.u32 %r32,0;\n"							\
+  "{\n"									\
+  ".param .u32 %retval_in;\n"						\
+  "{\n"									\
+  ".param .u32 %out_arg0;\n"						\
+  "st.param.u32 [%out_arg0],%r32;\n"					\
+  "call (%retval_in),GOACC_tid,(%out_arg0);\n"				\
+  "}\n"									\
+  "ld.param.u32 %r33,[%retval_in];\n"					\
+  "}\n"									\
+  "mov.u32 %r25,%r33;\n"						\
+  "add.u32 %r26,%r24,%r25;\n"						\
+  "mov.u32 %r27,%r26;\n"						\
+  "mov.u32 %retval,%r27;\n"						\
+  "st.param.u32 [%out_retval],%retval;\n"				\
+  "ret;\n"								\
+  "}\n"
diff --git a/libgomp/plugin-nvptx.c b/libgomp/plugin-nvptx.c
index f1ca235..4db2f32 100644
--- a/libgomp/plugin-nvptx.c
+++ b/libgomp/plugin-nvptx.c
@@ -35,6 +35,7 @@
 #include "libgomp.h"
 #include "target.h"
 #include "libgomp-plugin.h"
+#include "oacc-ptx.h"
 #include "oacc-plugin.h"
 
 #include <cuda.h>
@@ -740,79 +741,6 @@ PTX_avail(void)
   return avail;
 }
 
-#define ABORT_PTX				\
-  ".version 3.1\n"				\
-  ".target sm_30\n"				\
-  ".address_size 64\n"				\
-  ".visible .func abort;\n"			\
-  ".visible .func abort\n"			\
-  "{\n"						\
-  "trap;\n"					\
-  "ret;\n"					\
-  "}\n"						\
-  ".visible .func _gfortran_abort;\n"		\
-  ".visible .func _gfortran_abort\n"		\
-  "{\n"						\
-  "trap;\n"					\
-  "ret;\n"					\
-  "}\n" \
-
-/* Generated with:
-
-   $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline
-*/
-#define ACC_ON_DEVICE_PTX						\
-  "        .version        3.1\n"					\
-  "        .target sm_30\n"						\
-  "        .address_size 64\n"						\
-  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \
-  ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \
-  "{\n"									\
-  "        .reg.u32 %ar1;\n"						\
-  ".reg.u32 %retval;\n"							\
-  "        .reg.u64 %hr10;\n"						\
-  "        .reg.u32 %r24;\n"						\
-  "        .reg.u32 %r25;\n"						\
-  "        .reg.pred %r27;\n"						\
-  "        .reg.u32 %r30;\n"						\
-  "        ld.param.u32 %ar1, [%in_ar1];\n"				\
-  "                mov.u32 %r24, %ar1;\n"				\
-  "                setp.ne.u32 %r27,%r24,4;\n"				\
-  "                set.u32.eq.u32 %r30,%r24,5;\n"			\
-  "                neg.s32 %r25, %r30;\n"				\
-  "        @%r27   bra     $L3;\n"					\
-  "                mov.u32 %r25, 1;\n"					\
-  "$L3:\n"								\
-  "                mov.u32 %retval, %r25;\n"				\
-  "        st.param.u32    [%out_retval], %retval;\n"			\
-  "        ret;\n"							\
-  "        }\n"								\
-  ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \
-  ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \
-  "{\n"									\
-  "        .reg.u64 %ar1;\n"						\
-  ".reg.u32 %retval;\n"							\
-  "        .reg.u64 %hr10;\n"						\
-  "        .reg.u64 %r25;\n"						\
-  "        .reg.u32 %r26;\n"						\
-  "        .reg.u32 %r27;\n"						\
-  "        ld.param.u64 %ar1, [%in_ar1];\n"				\
-  "                mov.u64 %r25, %ar1;\n"				\
-  "                ld.u32  %r26, [%r25];\n"				\
-  "        {\n"								\
-  "                .param.u32 %retval_in;\n"				\
-  "        {\n"								\
-  "                .param.u32 %out_arg0;\n"				\
-  "                st.param.u32 [%out_arg0], %r26;\n"			\
-  "                call (%retval_in), acc_on_device, (%out_arg0);\n"	\
-  "        }\n"								\
-  "                ld.param.u32    %r27, [%retval_in];\n"		\
-  "}\n"									\
-  "                mov.u32 %retval, %r27;\n"				\
-  "        st.param.u32    [%out_retval], %retval;\n"			\
-  "        ret;\n"							\
-  "        }"
-
 static void
 link_ptx (CUmodule *module, char *ptx_code)
 {
@@ -874,6 +802,16 @@ link_ptx (CUmodule *module, char *ptx_code)
 			 cuErrorMsg (r));
     }
 
+  char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
+  r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
+		     strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
+      GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
+			 cuErrorMsg (r));
+    }
+
   r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code,
               strlen (ptx_code) + 1, 0, 0, 0, 0);
   if (r != CUDA_SUCCESS)
@@ -1053,7 +991,7 @@ PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
   kargs[0] = &dp;
   r = cuLaunchKernel (function,
-			1, 1, 1,
+			num_gangs, 1, 1,
 			nthreads_in_block, 1, 1,
 			0, dev_str->stream, kargs, 0);
   if (r != CUDA_SUCCESS)
diff --git a/libgomp/testsuite/libgomp.oacc-c/reduction-1.c b/libgomp/testsuite/libgomp.oacc-c/reduction-1.c
new file mode 100644
index 0000000..acf9540
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/reduction-1.c
@@ -0,0 +1,174 @@
+/* { dg-do run } */
+
+/* Integer reductions.  */
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#define vl 32
+
+int
+main(void)
+{
+  const int n = 1000;
+  int i;
+  int vresult, result, array[n];
+  bool lvresult, lresult;
+
+  for (i = 0; i < n; i++)
+    array[i] = i;
+
+  result = 0;
+  vresult = 0;
+
+  /* '+' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result += array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult += array[i];
+
+  if (result != vresult)
+    abort ();
+
+  result = 0;
+  vresult = 0;
+
+  /* '*' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (*:result)
+  for (i = 0; i < n; i++)
+    result *= array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult *= array[i];
+
+  if (result != vresult)
+    abort ();
+
+//   result = 0;
+//   vresult = 0;
+// 
+//   /* 'max' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result > array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult > array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+// 
+//   result = 0;
+//   vresult = 0;
+// 
+//   /* 'min' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result < array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult < array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+
+  result = 0;
+  vresult = 0;
+
+  /* '&' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&:result)
+  for (i = 0; i < n; i++)
+    result &= array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult &= array[i];
+
+  if (result != vresult)
+    abort ();
+
+  result = 0;
+  vresult = 0;
+
+  /* '|' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (|:result)
+  for (i = 0; i < n; i++)
+    result |= array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult |= array[i];
+
+  if (result != vresult)
+    abort ();
+
+  result = 0;
+  vresult = 0;
+
+  /* '^' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (^:result)
+  for (i = 0; i < n; i++)
+    result ^= array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult ^= array[i];
+
+  if (result != vresult)
+    abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '&&' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult && (result > array[i]);
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult && (result > array[i]);
+
+  if (lresult != lvresult)
+    abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '||' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult || (result > array[i]);
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult || (result > array[i]);
+
+  if (lresult != lvresult)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/reduction-2.c b/libgomp/testsuite/libgomp.oacc-c/reduction-2.c
new file mode 100644
index 0000000..c2ec110
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/reduction-2.c
@@ -0,0 +1,126 @@
+/* { dg-do run } */
+
+/* float reductions.  */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+
+#define vl 32
+
+int
+main(void)
+{
+  const int n = 1000;
+  int i;
+  float vresult, result, array[n];
+  bool lvresult, lresult;
+
+  for (i = 0; i < n; i++)
+    array[i] = i;
+
+  result = 0;
+  vresult = 0;
+
+  /* '+' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result += array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult += array[i];
+
+  if (result != vresult)
+    abort ();
+
+  result = 0;
+  vresult = 0;
+
+  /* '*' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (*:result)
+  for (i = 0; i < n; i++)
+    result *= array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult *= array[i];
+
+  if (fabs(result - vresult) > .0001)
+    abort ();
+//   result = 0;
+//   vresult = 0;
+// 
+//   /* 'max' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result > array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult > array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+// 
+//   result = 0;
+//   vresult = 0;
+// 
+//   /* 'min' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result < array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult < array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '&&' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult && (result > array[i]);
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult && (result > array[i]);
+
+  if (lresult != lvresult)
+    abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '||' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult || (result > array[i]);
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult || (result > array[i]);
+
+  if (lresult != lvresult)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/reduction-3.c b/libgomp/testsuite/libgomp.oacc-c/reduction-3.c
new file mode 100644
index 0000000..58b49ff
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/reduction-3.c
@@ -0,0 +1,126 @@
+/* { dg-do run } */
+
+/* double reductions.  */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+
+#define vl 32
+
+int
+main(void)
+{
+  const int n = 1000;
+  int i;
+  double vresult, result, array[n];
+  bool lvresult, lresult;
+
+  for (i = 0; i < n; i++)
+    array[i] = i;
+
+  result = 0;
+  vresult = 0;
+
+  /* '+' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result += array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult += array[i];
+
+  if (result != vresult)
+    abort ();
+
+  result = 0;
+  vresult = 0;
+
+  /* '*' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (*:result)
+  for (i = 0; i < n; i++)
+    result *= array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult *= array[i];
+
+  if (fabs(result - vresult) > .0001)
+    abort ();
+//   result = 0;
+//   vresult = 0;
+// 
+//   /* 'max' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result > array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult > array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+// 
+//   result = 0;
+//   vresult = 0;
+// 
+//   /* 'min' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result < array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult < array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '&&' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult && (result > array[i]);
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult && (result > array[i]);
+
+  if (lresult != lvresult)
+    abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '||' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult || (result > array[i]);
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult || (result > array[i]);
+
+  if (lresult != lvresult)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/reduction-4.c b/libgomp/testsuite/libgomp.oacc-c/reduction-4.c
new file mode 100644
index 0000000..c8a9a6c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/reduction-4.c
@@ -0,0 +1,129 @@
+/* { dg-do run } */
+
+/* complex reductions.  */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+
+#define vl 32
+
+int
+main(void)
+{
+  const int n = 1000;
+  int i;
+  double complex vresult, result, array[n];
+  bool lvresult, lresult;
+
+  for (i = 0; i < n; i++)
+    array[i] = i;
+
+  result = 0;
+  vresult = 0;
+
+  /* '+' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (+:result)
+  for (i = 0; i < n; i++)
+    result += array[i];
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    vresult += array[i];
+
+  if (result != vresult)
+    abort ();
+
+  result = 0;
+  vresult = 0;
+
+  /* Needs support for complex multiplication.  */
+
+//   /* '*' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (*:result)
+//   for (i = 0; i < n; i++)
+//     result *= array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//     vresult *= array[i];
+// 
+//   if (fabs(result - vresult) > .0001)
+//     abort ();
+//   result = 0;
+//   vresult = 0;
+
+//   /* 'max' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result > array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult > array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+// 
+//   result = 0;
+//   vresult = 0;
+// 
+//   /* 'min' reductions.  */
+// #pragma acc parallel vector_length (vl)
+// #pragma acc loop reduction (+:result)
+//   for (i = 0; i < n; i++)
+//       result = result < array[i] ? result : array[i];
+// 
+//   /* Verify the reduction.  */
+//   for (i = 0; i < n; i++)
+//       vresult = vresult < array[i] ? vresult : array[i];
+// 
+//   printf("%d != %d\n", result, vresult);
+//   if (result != vresult)
+//     abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '&&' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (&&:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult && (creal(result) > creal(array[i]));
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult && (creal(result) > creal(array[i]));
+
+  if (lresult != lvresult)
+    abort ();
+
+  result = 5;
+  vresult = 5;
+
+  lresult = false;
+  lvresult = false;
+
+  /* '||' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (||:lresult)
+  for (i = 0; i < n; i++)
+    lresult = lresult || (creal(result) > creal(array[i]));
+
+  /* Verify the reduction.  */
+  for (i = 0; i < n; i++)
+    lvresult = lresult || (creal(result) > creal(array[i]));
+
+  if (lresult != lvresult)
+    abort ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c/reduction-5.c b/libgomp/testsuite/libgomp.oacc-c/reduction-5.c
new file mode 100644
index 0000000..757b8be
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/reduction-5.c
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (void)
+{
+  int s1 = 2, s2 = 5, v1 = 2, v2 = 5;
+  int n = 100;
+  int i;
+
+#pragma acc parallel vector_length (1000)
+#pragma acc loop reduction (+:s1, s2)
+  for (i = 0; i < n; i++)
+    {
+      s1 = s1 + 3;
+      s2 = s2 + 2;
+    }
+
+  for (i = 0; i < n; i++)
+    {
+      v1 = v1 + 3;
+      v2 = v2 + 2;
+    }
+  
+  if (s1 != v1)
+    abort ();
+  
+  if (s2 != v2)
+    abort ();
+    
+  return 0;
+}
\ No newline at end of file
diff --git a/libgomp/testsuite/libgomp.oacc-c/reduction-initial-1.c b/libgomp/testsuite/libgomp.oacc-c/reduction-initial-1.c
index e763cf2..0f66a39 100644
--- a/libgomp/testsuite/libgomp.oacc-c/reduction-initial-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c/reduction-initial-1.c
@@ -1,6 +1,4 @@
 /* { dg-do run } */
-/* TODO:
-   { dg-xfail-run-if "" { *-*-* } { "-DACC_DEVICE_TYPE_host=1" } { "" } } */
 
 int
 main(void)
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-1.f90
new file mode 100644
index 0000000..3c6130e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-1.f90
@@ -0,0 +1,225 @@
+! { dg-do run }
+
+! Integer reductions
+
+program reduction_1
+  implicit none
+
+  integer, parameter    :: n = 10, vl = 2
+  integer               :: i, vresult, result
+  logical               :: lresult, lvresult
+  integer, dimension (n) :: array
+
+  do i = 1, n
+     array(i) = i
+  end do
+
+  result = 0
+  vresult = 0
+
+  ! '+' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(+:result)
+  do i = 1, n
+     result = result + array(i)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = vresult + array(i)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 0
+  vresult = 0
+
+  ! '*' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(*:result)
+  do i = 1, n
+     result = result * array(i)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = vresult * array(i)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 0
+  vresult = 0
+
+  ! 'max' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(max:result)
+  do i = 1, n
+     result = max (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = max (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 1
+  vresult = 1
+
+  ! 'min' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(min:result)
+  do i = 1, n
+     result = min (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = min (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 1
+  vresult = 1
+
+  ! 'iand' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(iand:result)
+  do i = 1, n
+     result = iand (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = iand (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 1
+  vresult = 1
+
+  ! 'ior' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(ior:result)
+  do i = 1, n
+     result = ior (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = ior (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 0
+  vresult = 0
+
+  ! 'ieor' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(ieor:result)
+  do i = 1, n
+     result = ieor (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = ieor (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.and.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.and.:lresult)
+  do i = 1, n
+     lresult = lresult .and. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .and. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.or.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.or.:lresult)
+  do i = 1, n
+     lresult = lresult .or. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .or. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.eqv.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.eqv.:lresult)
+  do i = 1, n
+     lresult = lresult .eqv. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .eqv. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.neqv.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.neqv.:lresult)
+  do i = 1, n
+     lresult = lresult .neqv. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .neqv. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+end program reduction_1
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-2.f90
new file mode 100644
index 0000000..241c795
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-2.f90
@@ -0,0 +1,170 @@
+! { dg-do run }
+
+! real reductions
+
+program reduction_2
+  implicit none
+
+  integer, parameter    :: n = 10, vl = 2
+  integer               :: i
+  real, parameter       :: e = .001
+  real                  :: vresult, result
+  logical               :: lresult, lvresult
+  real, dimension (n) :: array
+
+  do i = 1, n
+     array(i) = i
+  end do
+
+  result = 0
+  vresult = 0
+
+  ! '+' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(+:result)
+  do i = 1, n
+     result = result + array(i)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = vresult + array(i)
+  end do
+
+  if (abs (result - vresult) .ge. e) call abort
+
+  result = 1
+  vresult = 1
+
+  ! '*' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(*:result)
+  do i = 1, n
+     result = result * array(i)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = vresult * array(i)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 0
+  vresult = 0
+
+  ! 'max' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(max:result)
+  do i = 1, n
+     result = max (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = max (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 1
+  vresult = 1
+
+  ! 'min' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(min:result)
+  do i = 1, n
+     result = min (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = min (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 1
+  vresult = 1
+
+  ! '.and.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.and.:lresult)
+  do i = 1, n
+     lresult = lresult .and. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .and. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.or.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.or.:lresult)
+  do i = 1, n
+     lresult = lresult .or. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .or. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.eqv.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.eqv.:lresult)
+  do i = 1, n
+     lresult = lresult .eqv. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .eqv. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.neqv.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.neqv.:lresult)
+  do i = 1, n
+     lresult = lresult .neqv. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .neqv. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+end program reduction_2
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-3.f90
new file mode 100644
index 0000000..3ae82ed
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-3.f90
@@ -0,0 +1,170 @@
+! { dg-do run }
+
+! double precision reductions
+
+program reduction_3
+  implicit none
+
+  integer, parameter    :: n = 10, vl = 2
+  integer               :: i
+  double precision, parameter :: e = .001
+  double precision      :: vresult, result
+  logical               :: lresult, lvresult
+  double precision, dimension (n) :: array
+
+  do i = 1, n
+     array(i) = i
+  end do
+
+  result = 0
+  vresult = 0
+
+  ! '+' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(+:result)
+  do i = 1, n
+     result = result + array(i)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = vresult + array(i)
+  end do
+
+  if (abs (result - vresult) .ge. e) call abort
+
+  result = 1
+  vresult = 1
+
+  ! '*' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(*:result)
+  do i = 1, n
+     result = result * array(i)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = vresult * array(i)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 0
+  vresult = 0
+
+  ! 'max' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(max:result)
+  do i = 1, n
+     result = max (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = max (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 1
+  vresult = 1
+
+  ! 'min' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(min:result)
+  do i = 1, n
+     result = min (result, array(i))
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = min (vresult, array(i))
+  end do
+
+  if (result.ne.vresult) call abort
+
+  result = 1
+  vresult = 1
+
+  ! '.and.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.and.:lresult)
+  do i = 1, n
+     lresult = lresult .and. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .and. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.or.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.or.:lresult)
+  do i = 1, n
+     lresult = lresult .or. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .or. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.eqv.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.eqv.:lresult)
+  do i = 1, n
+     lresult = lresult .eqv. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .eqv. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+
+  lresult = .false.
+  lvresult = .false.
+
+  ! '.neqv.' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(.neqv.:lresult)
+  do i = 1, n
+     lresult = lresult .neqv. (array(i) .ge. 5)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     lvresult = lvresult .neqv. (array(i) .ge. 5)
+  end do
+
+  if (result.ne.vresult) call abort
+end program reduction_3
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-4.f90
new file mode 100644
index 0000000..c33b0dc
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-4.f90
@@ -0,0 +1,54 @@
+! { dg-do run }
+
+! complex reductions
+
+program reduction_4
+  implicit none
+
+  integer, parameter    :: n = 10, vl = 32
+  integer               :: i
+  complex               :: vresult, result
+  complex, dimension (n) :: array
+
+  do i = 1, n
+     array(i) = i
+  end do
+
+  result = 0
+  vresult = 0
+
+  ! '+' reductions
+
+  !$acc parallel vector_length(vl) num_gangs(2)
+  !$acc loop reduction(+:result)
+  do i = 1, n
+     result = result + array(i)
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vresult = vresult + array(i)
+  end do
+
+  if (result .ne. vresult) call abort
+
+  result = 1
+  vresult = 1
+
+!  ! '*' reductions
+!
+!  !$acc parallel vector_length(vl)
+!  !$acc loop reduction(*:result)
+!  do i = 1, n
+!     result = result * array(i)
+!  end do
+!  !$acc end parallel
+!
+!  ! Verify the results
+!  do i = 1, n
+!     vresult = vresult * array(i)
+!  end do
+!
+!  if (result.ne.vresult) call abort
+end program reduction_4
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-5.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-5.f90
new file mode 100644
index 0000000..f27fb8e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-5.f90
@@ -0,0 +1,32 @@
+! { dg-do run }
+
+! subroutine reduction
+
+program reduction
+  integer, parameter    :: n = 40, c = 10
+  integer               :: i, vsum, sum
+
+  call redsub (sum, n, c)
+
+  vsum = 0
+
+  ! Verify the results
+  do i = 1, n
+     vsum = vsum + c
+  end do
+
+  if (sum.ne.vsum) call abort ()
+end program reduction
+
+subroutine redsub(sum, n, c)
+  integer :: sum, n, c
+
+  sum = 0
+
+  !$acc parallel vector_length(n) copyin (n, c) num_gangs(2)
+  !$acc loop reduction(+:sum)
+  do i = 1, n
+     sum = sum + c
+  end do
+  !$acc end parallel
+end subroutine redsub
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90
new file mode 100644
index 0000000..6325431
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90
@@ -0,0 +1,30 @@
+! { dg-do run }
+
+program reduction
+  implicit none
+
+  integer, parameter    :: n = 100
+  integer               :: i, s1, s2, vs1, vs2
+
+  s1 = 0
+  s2 = 0
+  vs1 = 0
+  vs2 = 0
+
+  !$acc parallel vector_length (1000)
+  !$acc loop reduction(+:s1, s2)
+  do i = 1, n
+     s1 = s1 + 1
+     s2 = s2 + 2
+  end do
+  !$acc end parallel
+
+  ! Verify the results
+  do i = 1, n
+     vs1 = vs1 + 1
+     vs2 = vs2 + 2
+  end do
+
+  if (s1.ne.vs1) call abort ()
+  if (s2.ne.vs2) call abort ()
+end program reduction

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]