This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[gomp4] various OpenACC/PTX built-ins and a reduction tweak


The patch introduces the following OpenACC/PTX-specific built-ins:

  * GOACC_ntid
  * GOACC_tid
  * GOACC_nctaid
  * GOACC_ctaid
  * acc_on_device
  * GOACC_get_thread_num
  * GOACC_get_num_threads

Of these functions, the only one that is part of the OpenACC spec is
acc_on_device. The other functions are helpers for omp-low.c. In
particular, I'm using GOACC_get_thread_num and GOACC_get_num_threads to
determine the number of accelerator threads available to the reduction
clause. Currently, GOACC_get_num_threads is num_gangs * vector_length, but
that value is subject to change later on. It's probably premature to
include the PTX built-ins right now, but I'd like to keep the middle end of
our internal OpenACC branch in sync with gomp-4_0-branch.

This patch also allows OpenACC reductions to process the array holding
partial reductions on the accelerator, instead of copying that array
back to the host. Currently, this only happens when num_gangs = 1. For
PTX targets, we're going to need to use another kernel to process the
array of partial results because PTX lacks inter-CTA synchronization
(we're currently mapping gangs to CTAs). That's why I was working on the
routine clause recently.

Is this OK for gomp-4_0-branch?

Thanks,
Cesar
2014-09-16  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/
	* builtins.c (expand_builtin_acc_on_device): New function.
	(expand_oacc_builtin): New function.
	(expand_builtin): Handle BUILT_IN_GOACC_NCTAID, BUILT_IN_GOACC_CTAID,
	BUILT_IN_GOACC_NTID, BUILT_IN_GOACC_TID, BUILT_IN_GOACC_GET_THREAD_NUM
	and BUILT_IN_GOACC_GET_NUM_THREADS.
	(is_simple_builtin): Handle BUILT_IN_GOACC_NTID and BUILT_IN_GOACC_TID.
	(is_inexpensive_builtin): Handle BUILT_IN_ACC_ON_DEVICE.
	* gcc/builtins.def (DEF_GOACC_BUILTIN): Temporarily make COND always
	true.
	(DEF_GOACC_BUILTIN_COMPILER): New.

	* gcc/oacc-builtins.def (BUILT_IN_GOACC_NTID, BUILT_IN_GOACC_TID,
	BUILT_IN_GOACC_NCTAID, BUILT_IN_GOACC_CTAID, BUILT_IN_ACC_ON_DEVICE,
	BUILT_IN_GOACC_GET_THREAD_NUM, BUILT_IN_GOACC_GET_NUM_THREADS): New
	built-ins.
	* gcc/omp-low.c (finish_reduction_on_host): New function.
	(oacc_host_nthreads): New function.
	(lower_reduction_clauses): Process the array of partial reductions
	on the accelerator if num_gangs = 1.
	(expand_omp_for_static_nochunk): Use BUILT_IN_GOACC_GET_NUM_THREADS and
	BUILT_IN_GOACC_GET_THREAD_NUM for nthreads and threadid, respectively,
	with GF_OMP_FOR_KIND_OACC_LOOP.
	(expand_omp_for_static_chunk): Likewise.
	(expand_omp_target): Likewise.
	(initialize_reduction_data): Adjust memory maps for the case where
	the partial reductions are processed on the accelerator.
	(finalize_reduction_data): Handle reductions on the accelerator.
	(process_reduction_data): Likewise.


	gcc/fortran/
	* f95-lang.c (gfc_init_builtin_functions): Define
	DEF_GOACC_BUILTIN_COMPILER.
	* types.def (BT_FN_INT_INT): New DEF_FUNCTION_TYPE_1 entry.
	(BT_FN_VOID_INT_PTR_INT): New DEF_FUNCTION_TYPE_3 entry.

	gcc/testsuite/
	* c-c++-common/goacc/goacc_builtins.c: New test.


diff --git a/gcc/builtins.c b/gcc/builtins.c
index 975f696..fa1ac2d 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -5747,6 +5747,131 @@ expand_stack_save (void)
   return ret;
 }
 
+
+/* Expand a call EXP to OpenACC's acc_on_device; return NULL_RTX if
+   validate_arglist rejects EXP's arguments.  TARGET is unused.
+   This has to happen late (that is, not in early folding; expand_builtin_*,
+   rather than fold_builtin_*), as we have to act differently for host and
+   acceleration device.  */
+
+static rtx
+expand_builtin_acc_on_device (tree exp, rtx target ATTRIBUTE_UNUSED)
+{
+  if (!validate_arglist (exp, INTEGER_TYPE, VOID_TYPE))
+    return NULL_RTX;
+
+  tree arg, v1, v2, ret;
+  location_t loc;
+
+  arg = CALL_EXPR_ARG (exp, 0);
+  arg = builtin_save_expr (arg);
+  loc = EXPR_LOCATION (exp);
+
+  /* Build: (arg == v1 || arg == v2) ? 1 : 0, with v1 and v2 being the
+     device-type values for which this compiler reports a match.  */
+#ifdef ACCEL_COMPILER
+  v1 = build_int_cst (TREE_TYPE (arg), /* TODO: acc_device_not_host */ 3);
+  v2 = build_int_cst (TREE_TYPE (arg), ACCEL_COMPILER_acc_device);
+#else
+  v1 = build_int_cst (TREE_TYPE (arg), /* TODO: acc_device_none */ 0);
+  v2 = build_int_cst (TREE_TYPE (arg), /* TODO: acc_device_host */ 2);
+#endif
+
+  v1 = fold_build2_loc (loc, EQ_EXPR, integer_type_node, arg, v1);
+  v2 = fold_build2_loc (loc, EQ_EXPR, integer_type_node, arg, v2);
+
+  /* Can't use TRUTH_ORIF_EXPR, as that is not supported by
+     expand_expr_real*; "v1 ? v1 : v2" yields the same disjunction.  */
+  ret = fold_build3_loc (loc, COND_EXPR, integer_type_node, v1, v1, v2);
+  ret = fold_build3_loc (loc, COND_EXPR, integer_type_node,
+			 ret, integer_one_node, integer_zero_node);
+
+  return expand_normal (ret);
+}
+
+/* Expand the OpenACC thread-id/thread-count built-in FCODE called by EXP,
+   into TARGET if convenient; lacking a target insn, ids are 0, counts 1.  */
+static rtx
+expand_oacc_builtin (enum built_in_function fcode, tree exp, rtx target)
+{
+  tree arg0 = NULL_TREE;
+  bool has_arg0 = false;
+  rtx result = const0_rtx;	/* Fallback when no insn is emitted.  */
+  rtx arg = NULL_RTX;
+
+  enum insn_code icode = CODE_FOR_nothing;
+  switch (fcode)
+    {
+    case BUILT_IN_GOACC_NTID:
+#ifdef HAVE_oacc_ntid
+      icode = CODE_FOR_oacc_ntid;
+#endif
+      has_arg0 = true;
+      result = const1_rtx;
+      break;
+    case BUILT_IN_GOACC_TID:
+#ifdef HAVE_oacc_tid
+      icode = CODE_FOR_oacc_tid;
+#endif
+      has_arg0 = true;
+      break;
+    case BUILT_IN_GOACC_NCTAID:
+#ifdef HAVE_oacc_nctaid
+      icode = CODE_FOR_oacc_nctaid;
+#endif
+      has_arg0 = true;
+      result = const1_rtx;
+      break;
+    case BUILT_IN_GOACC_CTAID:
+#ifdef HAVE_oacc_ctaid
+      icode = CODE_FOR_oacc_ctaid;
+#endif
+      has_arg0 = true;
+      break;
+    case BUILT_IN_GOACC_GET_THREAD_NUM:
+#ifdef HAVE_oacc_threadnum
+      icode = CODE_FOR_oacc_threadnum;
+#endif
+      result = const0_rtx;
+      break;
+    case BUILT_IN_GOACC_GET_NUM_THREADS:
+#ifdef HAVE_oacc_numthreads
+      icode = CODE_FOR_oacc_numthreads;
+#endif
+      result = const1_rtx;
+      break;
+    default:
+      break;
+    }
+
+  if (has_arg0)
+    {
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      /* Only an integer-constant argument is supported.  */
+      gcc_assert (TREE_CODE (arg0) == INTEGER_CST);
+      arg = expand_normal (arg0);
+    }
+  if (icode != CODE_FOR_nothing)
+    {
+      enum machine_mode mode = insn_data[icode].operand[0].mode;
+      rtx tmp = target;
+      rtx insn;
+      if (!tmp || !REG_P (tmp) || GET_MODE (tmp) != mode)
+	tmp = gen_reg_rtx (mode);
+      if (arg0)
+	insn = GEN_FCN (icode) (tmp, arg);
+      else
+	insn = GEN_FCN (icode) (tmp);
+      if (insn != NULL_RTX)
+	{
+	  emit_insn (insn);
+	  return tmp;
+	}
+    }
+
+  return result;
+}
+
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient
    (and in mode MODE if that's convenient).
@@ -6816,6 +6941,20 @@ expand_builtin (tree exp, rtx target, rtx subtarget, enum machine_mode mode,
       expand_builtin_cilk_pop_frame (exp);
       return const0_rtx;
 
+    case BUILT_IN_ACC_ON_DEVICE:
+      target = expand_builtin_acc_on_device (exp, target);
+      if (target)
+	return target;
+      break;
+
+    case BUILT_IN_GOACC_NCTAID:
+    case BUILT_IN_GOACC_CTAID:
+    case BUILT_IN_GOACC_NTID:
+    case BUILT_IN_GOACC_TID:
+    case BUILT_IN_GOACC_GET_THREAD_NUM:
+    case BUILT_IN_GOACC_GET_NUM_THREADS:
+      return expand_oacc_builtin (fcode, exp, target);
+
     default:	/* just do library call, if unknown builtin */
       break;
     }
@@ -12663,6 +12802,9 @@ is_simple_builtin (tree decl)
       case BUILT_IN_EH_FILTER:
       case BUILT_IN_EH_POINTER:
       case BUILT_IN_EH_COPY_VALUES:
+	/* Just a special register access.  */
+      case BUILT_IN_GOACC_NTID:
+      case BUILT_IN_GOACC_TID:
 	return true;
 
       default:
@@ -12748,6 +12890,7 @@ is_inexpensive_builtin (tree decl)
       case BUILT_IN_LABS:
       case BUILT_IN_LLABS:
       case BUILT_IN_PREFETCH:
+      case BUILT_IN_ACC_ON_DEVICE:
 	return true;
 
       default:
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 2ef896e..ef3267b 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -146,12 +146,18 @@ along with GCC; see the file COPYING3.  If not see
   DEF_BUILTIN (ENUM, NAME, BUILT_IN_NORMAL, BT_LAST, BT_LAST, false, false, \
 	       false, ATTR_LAST, false, false)
 
-/* Builtin used by the implementation of GNU OpenACC.  None of these are
-   actually implemented in the compiler; they're all in libgomp.  */
+/* Builtin used by the implementation of GNU OpenACC.  Few of these are
+   actually implemented in the compiler; most are in libgomp.  */
 #undef DEF_GOACC_BUILTIN
 #define DEF_GOACC_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
   DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
-               false, true, true, ATTRS, false, flag_openacc)
+               false, true, true, ATTRS, false, \
+	       (/* TODO */ true || flag_openacc))
+#undef DEF_GOACC_BUILTIN_COMPILER
+#define DEF_GOACC_BUILTIN_COMPILER(ENUM, NAME, TYPE, ATTRS) \
+  DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
+               true, true, true, ATTRS, false, \
+	       (/* TODO */ true || flag_openacc))
 
 /* Builtin used by the implementation of GNU OpenMP.  None of these are
    actually implemented in the compiler; they're all in libgomp.  */
@@ -159,7 +165,7 @@ along with GCC; see the file COPYING3.  If not see
 #define DEF_GOMP_BUILTIN(ENUM, NAME, TYPE, ATTRS) \
   DEF_BUILTIN (ENUM, "__builtin_" NAME, BUILT_IN_NORMAL, TYPE, TYPE,    \
                false, true, true, ATTRS, false, \
-	       (flag_openmp || flag_tree_parallelize_loops))
+	       (/* TODO */ true || flag_openmp || flag_tree_parallelize_loops))
 
 /* Builtin used by implementation of Cilk Plus.  Most of these are decomposed
    by the compiler but a few are implemented in libcilkrts.  */ 
diff --git a/gcc/fortran/f95-lang.c b/gcc/fortran/f95-lang.c
index e7c64b7..85f0ed6 100644
--- a/gcc/fortran/f95-lang.c
+++ b/gcc/fortran/f95-lang.c
@@ -1093,7 +1093,11 @@ gfc_init_builtin_functions (void)
 #define DEF_GOACC_BUILTIN(code, name, type, attr) \
       gfc_define_builtin ("__builtin_" name, builtin_types[type], \
 			  code, name, attr);
+#undef DEF_GOACC_BUILTIN_COMPILER
+#define DEF_GOACC_BUILTIN_COMPILER(code, name, type, attr) \
+      gfc_define_builtin (name, builtin_types[type], code, name, attr);
 #include "../oacc-builtins.def"
+#undef DEF_GOACC_BUILTIN_COMPILER
 #undef DEF_GOACC_BUILTIN
     }
 
diff --git a/gcc/fortran/types.def b/gcc/fortran/types.def
index 59ac4c3..1dce308 100644
--- a/gcc/fortran/types.def
+++ b/gcc/fortran/types.def
@@ -82,6 +82,7 @@ DEF_FUNCTION_TYPE_0 (BT_FN_VOID, BT_VOID)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_PTR, BT_VOID, BT_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_PTRPTR, BT_VOID, BT_PTR_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_VPTR, BT_VOID, BT_VOLATILE_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_INT_INT, BT_INT, BT_INT)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_UINT, BT_UINT, BT_UINT)
 DEF_FUNCTION_TYPE_1 (BT_FN_PTR_PTR, BT_PTR, BT_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_INT, BT_VOID, BT_INT)
@@ -144,6 +145,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I2_INT, BT_VOID, BT_VOLATILE_PTR, BT_I2, BT
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I4_INT, BT_VOID, BT_VOLATILE_PTR, BT_I4, BT_INT)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I8_INT, BT_VOID, BT_VOLATILE_PTR, BT_I8, BT_INT)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I16_INT, BT_VOID, BT_VOLATILE_PTR, BT_I16, BT_INT)
+DEF_FUNCTION_TYPE_3 (BT_FN_VOID_INT_PTR_INT, BT_VOID, BT_INT, BT_PTR, BT_INT)
 
 DEF_FUNCTION_TYPE_4 (BT_FN_VOID_OMPFN_PTR_UINT_UINT,
                      BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT)
diff --git a/gcc/oacc-builtins.def b/gcc/oacc-builtins.def
index dfb688c..909a917 100644
--- a/gcc/oacc-builtins.def
+++ b/gcc/oacc-builtins.def
@@ -39,3 +39,17 @@ DEF_GOACC_BUILTIN (BUILT_IN_GOACC_PARALLEL, "GOACC_parallel",
 		   ATTR_NOTHROW_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_UPDATE, "GOACC_update",
 		   BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_NTID, "GOACC_ntid",
+		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_TID, "GOACC_tid",
+		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_NCTAID, "GOACC_nctaid",
+		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_CTAID, "GOACC_ctaid",
+		   BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_COMPILER (BUILT_IN_ACC_ON_DEVICE, "acc_on_device",
+			    BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_THREAD_NUM, "GOACC_get_thread_num",
+		   BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN (BUILT_IN_GOACC_GET_NUM_THREADS, "GOACC_get_num_threads",
+		   BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def
index 698dc79..08b825c 100644
--- a/gcc/omp-builtins.def
+++ b/gcc/omp-builtins.def
@@ -236,6 +236,3 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TARGET_UPDATE, "GOMP_target_update",
 		  BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TEAMS, "GOMP_teams",
 		  BT_FN_VOID_UINT_UINT, ATTR_NOTHROW_LIST)
-
-DEF_GOMP_BUILTIN (BUILT_IN_OMP_SET_NUM_THREADS, "omp_set_num_threads",
-		  BT_FN_VOID_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 927a294..c4a6a90 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -238,6 +238,88 @@ omp_get_id (tree node)
   return IDENTIFIER_POINTER(get_identifier (temp_name));
 }
 
+/* Return true if the reduction array should be processed on the host.
+   Doing so avoids launching multiple kernels to synchronize threads
+   across PTX Cooperative Thread Arrays.  */
+static bool
+finish_reduction_on_host (omp_context *ctx)
+{
+  /* Currently, OpenACC gangs are mapped onto PTX CTAs.  Return true if
+     num_gangs is not a constant or is greater than one.  */
+  for (omp_context *oc = ctx; oc; oc = oc->outer)
+    {
+      tree c, t;
+      int gangs;
+
+      if (gimple_code (oc->stmt) == GIMPLE_OACC_PARALLEL)
+	{
+	  c = gimple_oacc_parallel_clauses (oc->stmt);
+	  t = find_omp_clause (c, OMP_CLAUSE_NUM_GANGS);
+	  if (t)
+	    {
+	      t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
+				    integer_type_node,
+				    OMP_CLAUSE_NUM_GANGS_EXPR (t));
+	      /* A non-constant num_gangs could exceed one at run time.  */
+	      if (TREE_CODE (t) != INTEGER_CST)
+		return true;
+
+	      gangs = TREE_INT_CST_LOW (t);
+
+	      if (gangs > 1)
+		return true;
+	    }
+	  break;	/* Only the innermost OpenACC parallel is examined.  */
+	}
+    }
+
+  return false;
+}
+
+/* Return the number of OpenACC threads for CTX as a sizetype expression.
+   Currently, this is num_gangs * vector_length (each defaulting to 1).  */
+
+static tree
+oacc_host_nthreads (omp_context *ctx)
+{
+  tree nthreads, vector_length, gangs, clauses;
+
+  gangs = fold_convert (sizetype, integer_one_node);
+  vector_length = gangs;
+
+  /* The reduction clause may be nested inside a loop directive.
+     Scan outward for the innermost OpenACC parallel and read its clauses.  */
+  for (omp_context *oc = ctx; oc; oc = oc->outer)
+    {
+      if (gimple_code (oc->stmt) != GIMPLE_OACC_PARALLEL)
+	continue;
+
+      clauses = gimple_oacc_parallel_clauses (oc->stmt);
+
+      vector_length = find_omp_clause (clauses, OMP_CLAUSE_VECTOR_LENGTH);
+      if (vector_length)
+	vector_length = fold_convert_loc (OMP_CLAUSE_LOCATION (vector_length),
+					  sizetype,
+					  OMP_CLAUSE_VECTOR_LENGTH_EXPR
+					  (vector_length));
+      else
+	vector_length = fold_convert (sizetype, integer_one_node);
+
+      gangs = find_omp_clause (clauses, OMP_CLAUSE_NUM_GANGS);
+      if (gangs)
+        gangs = fold_convert_loc (OMP_CLAUSE_LOCATION (gangs), sizetype,
+				  OMP_CLAUSE_NUM_GANGS_EXPR (gangs));
+      else
+	gangs = fold_convert (sizetype, integer_one_node);
+
+      break;
+    }
+
+  nthreads = fold_build2 (MULT_EXPR, sizetype, gangs, vector_length);
+
+  return nthreads;
+}
+
 /* Holds a decl for __OPENMP_TARGET__.  */
 static GTY(()) tree offload_symbol_decl;
 
@@ -4356,6 +4438,10 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *stmt_list,
 }
 
 
+static void
+finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
+			 omp_context *ctx, bool receiver = false);
+
 /* Generate code to implement the REDUCTION clauses.  */
 
 static void
@@ -4433,61 +4519,26 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	  tree t = NULL_TREE, array, nthreads;
 	  tree type = get_base_type (var);
 
-	  /* First ensure that the current tid is less than vector_length.  */
-	  tree exit_label = create_artificial_label (UNKNOWN_LOCATION);
-	  tree reduction_label = create_artificial_label (UNKNOWN_LOCATION);
-
 	  /* Get the current thread id.  */
-	  tree call = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM);
-	  gimple stmt = gimple_build_call (call, 1, integer_zero_node);
-	  tree fntype = gimple_call_fntype (stmt);
-	  tree tid = create_tmp_var (TREE_TYPE (fntype), NULL);
+	  tree call = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+	  tree tid = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+	  gimple stmt = gimple_build_call (call, 0);
 	  gimple_call_set_lhs (stmt, tid);
 	  gimple_seq_add_stmt (stmt_seqp, stmt);
 
 	  /* Find the total number of threads.  A reduction clause
 	     only appears inside a loop construction or a combined
 	     parallel and loop construct.  */
-	  tree c;
-
-	  if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR)
-	    c = gimple_oacc_parallel_clauses (ctx->outer->stmt);
-	  else
-	    c = gimple_oacc_parallel_clauses (ctx->stmt);
 
-	  t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
-
-	  if (t)
-	    {
-	      t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
-				    integer_type_node,
-				    OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
-	    }
-
-	  if (!t)
-	    t = integer_one_node;
+	  call = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+	  t = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+	  stmt = gimple_build_call (call, 0);
+	  gimple_call_set_lhs (stmt, t);
+	  gimple_seq_add_stmt (stmt_seqp, stmt);
 
-	  /* Extract the number of threads.  */
 	  nthreads = create_tmp_var (sizetype, NULL);
 	  gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype, t),
 			   stmt_seqp);
-	  stmt = gimple_build_assign_with_ops  (MINUS_EXPR, nthreads, nthreads,
-				 fold_build1 (NOP_EXPR, sizetype,
-					      integer_one_node));
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* If tid >= nthreads, goto exit_label.  */
-	  t = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (t, fold_build1 (NOP_EXPR, sizetype, tid),
-			   stmt_seqp);
-	  stmt = gimple_build_cond (GT_EXPR, t, nthreads, exit_label,
-				    reduction_label);
-	  gimple_seq_add_stmt (stmt_seqp, stmt);
-
-	  /* Place the reduction_label here.  */
-
-	  gimple_seq_add_stmt (stmt_seqp,
-			       gimple_build_label (reduction_label));
 
 	  /* Now insert the partial reductions into the array.  */
 
@@ -4510,9 +4561,11 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	  gimplify_assign (offset, TYPE_SIZE_UNIT (type),
 			   stmt_seqp);
 	  t = create_tmp_var (sizetype, NULL);
-	  gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype,
-							 tid)),
-			   stmt_seqp);
+
+	  /* Calculate the stack offset to be array[tid+1].  */
+	  x = fold_build2 (PLUS_EXPR, sizetype, build_int_cst (sizetype, 1),
+			   fold_build1 (NOP_EXPR, sizetype, tid));
+	  gimplify_assign (t, unshare_expr (x), stmt_seqp);
 	  stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, t);
 	  gimple_seq_add_stmt (stmt_seqp, stmt);
 
@@ -4528,8 +4581,50 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
 	  x = unshare_expr (build_simple_mem_ref (ptr));
 	  stmt = gimplify_assign (x, new_var, stmt_seqp);
 
-	  /* Place exit label here.  */
-	  gimple_seq_add_stmt (stmt_seqp, gimple_build_label (exit_label));
+	  /* Synchronize the threads and finish up the reduction.  */
+
+	  tree next = create_artificial_label (UNKNOWN_LOCATION);
+	  tree reduction_exit = create_artificial_label (UNKNOWN_LOCATION);
+
+	  /* Synchronize all of the threads.  */
+	  call = builtin_decl_explicit (BUILT_IN_SYNC_SYNCHRONIZE);
+	  stmt = gimple_build_call (call, 0);
+	  gimple_seq_add_stmt (stmt_seqp, stmt);
+
+	  /* Jump to the exit label if tid != 0.  */
+	  tree t1 = create_tmp_var (sizetype, NULL);
+	  tree t2 = create_tmp_var (sizetype, NULL);
+	  gimplify_assign (t1, fold_build1 (NOP_EXPR, sizetype, tid),
+			   stmt_seqp);
+	  gimplify_assign (t2, fold_build1 (NOP_EXPR, sizetype,
+					    integer_zero_node),
+			   stmt_seqp);
+	  stmt = gimple_build_cond (NE_EXPR, t1, t2, reduction_exit, next);
+	  gimple_seq_add_stmt (stmt_seqp, stmt);
+	  gimple_seq_add_stmt (stmt_seqp, gimple_build_label (next));
+
+	  if (finish_reduction_on_host (ctx))
+	    {
+	      /* Set the last element of the array to be 1 if this kernel
+		 is executed on the accelerator.  */
+	      call = builtin_decl_explicit (BUILT_IN_ACC_ON_DEVICE);
+	      tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (call)), NULL);
+	      stmt = gimple_build_call (call, 1, build_int_cst
+					(integer_type_node, 2));
+	      gimple_call_set_lhs (stmt, lhs);
+	      gimple_seq_add_stmt (stmt_seqp, stmt);
+
+	      x = unshare_expr (build_simple_mem_ref (array));
+	      stmt = gimplify_assign (x, convert (TREE_TYPE (new_var),
+						  fold_build1 (TRUTH_NOT_EXPR,
+							       sizetype, lhs)),
+				      stmt_seqp);
+	    }
+	    else
+	      finalize_reduction_data (clauses, nthreads, stmt_seqp, ctx,
+				       true);
+
+	  gimple_seq_add_stmt (stmt_seqp, gimple_build_label (reduction_exit));
 
 	  return;
 	    }
@@ -5644,9 +5739,9 @@ expand_oacc_offload (struct omp_region *region)
   tree openmp_target = get_offload_symbol_decl ();
   tree fnaddr = build_fold_addr_expr (child_fn);
   g = gimple_build_call (builtin_decl_explicit (start_ix), 10, device,
-			 fnaddr, build_fold_addr_expr (openmp_target),
-			 t1, t2, t3, t4,
-			 t_num_gangs, t_num_workers, t_vector_length);
+                    fnaddr, build_fold_addr_expr (openmp_target),
+                    t1, t2, t3, t4,
+                    t_num_gangs, t_num_workers, t_vector_length);
   gimple_set_location (g, gimple_location (entry_stmt));
   gsi_insert_before (&gsi, g, GSI_SAME_STMT);
 }
@@ -6913,8 +7008,10 @@ expand_omp_for_static_nochunk (struct omp_region *region,
       threadid = build_call_expr (threadid, 0);
       break;
     case GF_OMP_FOR_KIND_OACC_LOOP:
-      nthreads = integer_one_node;
-      threadid = integer_zero_node;
+      nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+      nthreads = build_call_expr (nthreads, 0);
+      threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+      threadid = build_call_expr (threadid, 0);
       break;
     default:
       gcc_unreachable ();
@@ -6922,6 +7019,15 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   nthreads = fold_convert (itype, nthreads);
   nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
 				       true, GSI_SAME_STMT);
+
+  /* Ensure nthreads is at least 1.  BUILT_IN_GOACC_NTID returns 0 for a target
+     that does not have a specific expansion.  */
+  nthreads
+    = fold_build2 (MAX_EXPR, itype, nthreads,
+		   fold_convert (TREE_TYPE (nthreads), integer_one_node));
+  nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
+				       true, GSI_SAME_STMT);
+
   threadid = fold_convert (itype, threadid);
   threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
 				       true, GSI_SAME_STMT);
@@ -7317,8 +7423,10 @@ expand_omp_for_static_chunk (struct omp_region *region,
       threadid = build_call_expr (threadid, 0);
       break;
     case GF_OMP_FOR_KIND_OACC_LOOP:
-      nthreads = integer_one_node;
-      threadid = integer_zero_node;
+      nthreads = builtin_decl_explicit (BUILT_IN_GOACC_GET_NUM_THREADS);
+      nthreads = build_call_expr (nthreads, 0);
+      threadid = builtin_decl_explicit (BUILT_IN_GOACC_GET_THREAD_NUM);
+      threadid = build_call_expr (threadid, 0);
       break;
     default:
       gcc_unreachable ();
@@ -7326,6 +7434,15 @@ expand_omp_for_static_chunk (struct omp_region *region,
   nthreads = fold_convert (itype, nthreads);
   nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
 				       true, GSI_SAME_STMT);
+
+  /* Ensure nthreads is at least 1.  BUILT_IN_GOACC_NTID returns 0 for a target
+     that does not have a specific expansion.  */
+  nthreads
+    = fold_build2 (MAX_EXPR, itype, nthreads,
+		   fold_convert (TREE_TYPE (nthreads), integer_one_node));
+  nthreads = force_gimple_operand_gsi (&gsi, nthreads, true, NULL_TREE,
+				       true, GSI_SAME_STMT);
+
   threadid = fold_convert (itype, threadid);
   threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
 				       true, GSI_SAME_STMT);
@@ -9390,6 +9507,7 @@ expand_omp_target (struct omp_region *region)
     g = gimple_build_call (builtin_decl_explicit (start_ix), 6, device,
 			   build_fold_addr_expr (openmp_target),
 			   t1, t2, t3, t4);
+
   gimple_set_location (g, gimple_location (entry_stmt));
   gsi_insert_before (&gsi, g, GSI_SAME_STMT);
   if (kind != GF_OMP_TARGET_KIND_REGION)
@@ -9782,6 +9900,14 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
   tree (*gimple_omp_clauses) (const_gimple);
   void (*gimple_omp_set_clauses) (gimple, tree);
 
+  /* Increment nthreads by one, so the kernel can return the host type
+     in the last element of the array.  */
+  t = create_tmp_var (sizetype, NULL);
+  gimplify_assign (t, fold_build2 (PLUS_EXPR, sizetype,
+				   fold_build1 (NOP_EXPR, sizetype, nthreads),
+				   build_int_cst (sizetype, 1)), stmt_seqp);
+  nthreads = t;
+
   /* Find the innermost PARALLEL openmp context.  FIXME: OpenACC kernels
      may require extra care unless they are converted to openmp for loops.  */
 
@@ -9817,7 +9943,6 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
 			 fold_convert (TREE_TYPE (nthreads),
 				       TYPE_SIZE_UNIT (type)));
       gimple_seq_add_stmt (stmt_seqp, stmt);
-
       size = create_tmp_var (sizetype, NULL);
       gimplify_assign (size, fold_build1 (NOP_EXPR, sizetype, t), stmt_seqp);
 
@@ -9837,7 +9962,8 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
 	 most clause so that copy-out works.  */
       tree x = array;
       t = build_omp_clause (gimple_location (ctx->stmt), OMP_CLAUSE_MAP);
-      OMP_CLAUSE_MAP_KIND (t) = OMP_CLAUSE_MAP_FORCE_FROM;
+      OMP_CLAUSE_MAP_KIND (t) = finish_reduction_on_host (ctx) ?
+	OMP_CLAUSE_MAP_FORCE_FROM : OMP_CLAUSE_MAP_FORCE_ALLOC;
       OMP_CLAUSE_DECL (t) = x;
       OMP_CLAUSE_CHAIN (t) = NULL;
       if (oc)
@@ -9857,53 +9983,103 @@ initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
 
 static void
 finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
-			 omp_context *ctx)
+			 omp_context *ctx, bool receiver)
 {
   gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt));
 
-  tree c, var, array, loop_header, loop_body, loop_exit, type;
+  tree c, x, var, array, loop_header, loop_body, loop_exit, type, ptype;
   gimple stmt;
 
+  /* Update nthreads in case the reduction kernel was executed on the
+     host.  */
+  if (!receiver)
+    {
+      for (c = clauses; c && OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION;
+	   c = OMP_CLAUSE_CHAIN (c));
+
+      /* Set up reduction variable, var.  Because it's not gimple register,
+	 it needs to be treated as a reference.  */
+      var = OMP_CLAUSE_DECL (c);
+      type = get_base_type (var);
+      ptype = build_pointer_type (type);
+      if (receiver)
+	var = lookup_decl_in_outer_ctx (var, ctx);
+
+      /* Extract array[0] into mem.  */
+      array = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
+      tree mem = create_tmp_var (type, NULL);
+      gimplify_assign (mem, build_simple_mem_ref (array), stmt_seqp);
+
+      tree l1 = create_artificial_label (UNKNOWN_LOCATION);
+      tree l2 = create_artificial_label (UNKNOWN_LOCATION);
+
+      x = create_tmp_var (integer_type_node, NULL);
+      gimplify_assign (x, convert (integer_type_node, mem), stmt_seqp);
+      stmt = gimple_build_cond (EQ_EXPR, x,
+				integer_zero_node, l1, l2);
+
+      gimple_seq_add_stmt (stmt_seqp, stmt);
+      gimple_seq_add_stmt (stmt_seqp, gimple_build_label (l1));
+      gimplify_assign (nthreads, build_int_cst (sizetype, 1), stmt_seqp);
+      gimple_seq_add_stmt (stmt_seqp, gimple_build_label (l2));
+    }
+
   /* Create for loop.
 
      let var = the original reduction variable
      let array = reduction variable array
 
-     var = array[0]
-     for (i = 1; i < nthreads; i++)
+     var = array[1]
+     for (i = 2; i < nthreads; i++)
        var op= array[i]
- */
+  */
 
   loop_header = create_artificial_label (UNKNOWN_LOCATION);
   loop_body = create_artificial_label (UNKNOWN_LOCATION);
   loop_exit = create_artificial_label (UNKNOWN_LOCATION);
 
   /* Initialize the reduction variables to be value of the first array
-     element.  */
+     element.  FIXME: A parallel loop should use the original reduction
+     variable as the initial value.  */
   for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
     {
       if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION)
 	continue;
 
-      tree_code reduction_code = OMP_CLAUSE_REDUCTION_CODE (c);
-
-      /* reduction(-:var) sums up the partial results, so it acts
-	 identically to reduction(+:var).  */
-      if (reduction_code == MINUS_EXPR)
-        reduction_code = PLUS_EXPR;
-
       /* Set up reduction variable, var.  Becuase it's not gimple register,
          it needs to be treated as a reference.  */
       var = OMP_CLAUSE_DECL (c);
       type = get_base_type (var);
-      tree ptr = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
+      ptype = build_pointer_type (type);
+      if (receiver)
+	var = lookup_decl_in_outer_ctx (var, ctx);
+      array = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
 
-      /* Extract array[0] into mem.  */
+      if (receiver)
+	{
+	  tree t = create_tmp_var (ptype, NULL);
+	  array = build_receiver_ref (array, false, ctx->outer);
+	  gimplify_assign (t, array, stmt_seqp);
+	  array = t;
+	}
+
+      /* Calculate the array offset.  */
+      tree offset = create_tmp_var (sizetype, NULL);
+      gimplify_assign (offset, TYPE_SIZE_UNIT (type), stmt_seqp);
+      stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset,
+					   build_int_cst (sizetype, 1));
+      gimple_seq_add_stmt (stmt_seqp, stmt);
+
+      tree ptr = create_tmp_var (TREE_TYPE (array), NULL);
+      stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr, array,
+					   offset);
+      gimple_seq_add_stmt (stmt_seqp, stmt);
+
+      /* Extract array[1] into mem.  */
       tree mem = create_tmp_var (type, NULL);
       gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp);
 
       /* Find the original reduction variable.  */
-      tree x = build_outer_var_ref (var, ctx);
       if (is_reference (var))
 	var = build_simple_mem_ref (var);
 
@@ -9913,16 +10089,15 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
 
   /* Create an index variable and set it to one.  */
   tree ix = create_tmp_var (sizetype, NULL);
-  gimplify_assign (ix, fold_build1 (NOP_EXPR, sizetype, integer_one_node),
-		   stmt_seqp);
+  gimplify_assign (ix, build_int_cst (sizetype, 2), stmt_seqp);
 
   /* Insert the loop header label here.  */
   gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header));
 
-  /* Loop if ix >= nthreads.  */
-  tree x = create_tmp_var (sizetype, NULL);
+  /* Loop if ix < nthreads.  */
+  x = create_tmp_var (sizetype, NULL);
   gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp);
-  stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body);
+  stmt = gimple_build_cond (GT_EXPR, ix, x, loop_exit, loop_body);
   gimple_seq_add_stmt (stmt_seqp, stmt);
 
   /* Insert the loop body label here.  */
@@ -9944,8 +10119,19 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
       /* Set up reduction variable var.  */
       var = OMP_CLAUSE_DECL (c);
       type = get_base_type (var);
+      ptype = build_pointer_type (type);
+      if (receiver)
+	var = lookup_decl_in_outer_ctx (var, ctx);
       array = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx);
 
+      if (receiver)
+	{
+	  tree t = create_tmp_var (ptype, NULL);
+	  array = build_receiver_ref (array, false, ctx->outer);
+	  gimplify_assign (t, array, stmt_seqp);
+	  array = t;
+	}
+
       /* Calculate the array offset.  */
       tree offset = create_tmp_var (sizetype, NULL);
       gimplify_assign (offset, TYPE_SIZE_UNIT (type), stmt_seqp);
@@ -9962,7 +10148,6 @@ finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp,
       gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp);
 
       /* Find the original reduction variable.  */
-      tree x = build_outer_var_ref (var, ctx);
       if (is_reference (var))
 	var = build_simple_mem_ref (var);
 
@@ -10026,7 +10211,6 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
 
   for (gsi = gsi_start (*body); !gsi_end_p (gsi); gsi_next (&gsi))
     {
-      tree call;
       tree clauses, nthreads, t, c;
       bool reduction_found = false;
  
@@ -10034,6 +10218,7 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
 
       switch (gimple_code (stmt))
 	{
+	  /* FIXME: A reduction may also appear in an oacc parallel.  */
 	case GIMPLE_OMP_FOR:
 	  clauses = gimple_omp_for_clauses (stmt);
 
@@ -10051,55 +10236,15 @@ process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp,
 	  ctx = maybe_lookup_ctx (stmt);
 	  t = NULL_TREE;
 
-	  /* The reduction clause may be nested inside a loop directive.
-	     Scan for the innermost vector_length clause.  */
-	  for (omp_context *oc = ctx; oc; oc = oc->outer)
-	    {
-	      switch (gimple_code (oc->stmt))
-		{
-		case GIMPLE_OACC_PARALLEL:
-		  c = gimple_oacc_parallel_clauses (oc->stmt);
-		  break;
-		case GIMPLE_OMP_FOR:
-		  c = gimple_omp_for_clauses (oc->stmt);
-		  break;
-		default:
-		  c = NULL_TREE;
-		  break;
-		}
-
-	      if (c && gimple_code (oc->stmt) == GIMPLE_OACC_PARALLEL)
-		{
-		  t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH);
-		  if (t)
-		    t = fold_convert_loc (OMP_CLAUSE_LOCATION (t),
-					  integer_type_node,
-					  OMP_CLAUSE_VECTOR_LENGTH_EXPR (t));
-		  break;
-		}
-	    }
-
-	  if (!t)
-	    t = integer_one_node;
-
 	  /* Extract the number of threads.  */
-	  nthreads = create_tmp_var (TREE_TYPE (t), NULL);
+	  nthreads = create_tmp_var (sizetype, NULL);
+	  t = oacc_host_nthreads (ctx);
 	  gimplify_assign (nthreads, t, in_stmt_seqp);
 
-	  /* Ensure nthreads >= 1.  */
-	  stmt = gimple_build_assign_with_ops (MAX_EXPR, nthreads, nthreads,
-				          fold_convert(TREE_TYPE (nthreads),
-						       integer_one_node));
-	  gimple_seq_add_stmt (in_stmt_seqp, stmt);
-
-	  /* Set the number of threads.  */
-	  /* FIXME: This needs to handle accelerators  */
-	  call = builtin_decl_explicit (BUILT_IN_OMP_SET_NUM_THREADS);
-	  stmt = gimple_build_call (call, 1, nthreads);
-	  gimple_seq_add_stmt (in_stmt_seqp, stmt);
-
 	  initialize_reduction_data (clauses, nthreads, in_stmt_seqp, ctx);
-	  finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx);
+
+	  if (finish_reduction_on_host (ctx))
+	    finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx);
 	  break;
 	default:
 	  // Scan for other directives which support reduction here.
diff --git a/gcc/testsuite/c-c++-common/goacc/goacc_builtins.c b/gcc/testsuite/c-c++-common/goacc/goacc_builtins.c
new file mode 100644
index 0000000..7fa0df8
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/goacc_builtins.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+
+void
+ntid (void)
+{
+  const int ntid_x = __builtin_GOACC_ntid (0);
+  const int ntid_y = __builtin_GOACC_ntid (1);
+  const int ntid_z = __builtin_GOACC_ntid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_ntid" } } */
+}
+
+void
+tid (void)
+{
+  const int tid_x = __builtin_GOACC_tid (0);
+  const int tid_y = __builtin_GOACC_tid (1);
+  const int tid_z = __builtin_GOACC_tid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_tid" } } */
+}
+
+void
+nctaid (void)
+{
+  const int nctaid_x = __builtin_GOACC_nctaid (0);
+  const int nctaid_y = __builtin_GOACC_nctaid (1);
+  const int nctaid_z = __builtin_GOACC_nctaid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_nctaid" } } */
+}
+
+void
+ctaid (void)
+{
+  const int ctaid_x = __builtin_GOACC_ctaid (0);
+  const int ctaid_y = __builtin_GOACC_ctaid (1);
+  const int ctaid_z = __builtin_GOACC_ctaid (2);
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_ctaid" } } */
+}
+
+void
+on_device (void)
+{
+  const int on_host = __builtin_acc_on_device (0);
+  const int on_accelerator = __builtin_acc_on_device (1);
+
+  /* { dg-final { scan-assembler-not "__builtin_acc_on_device" } } */
+}
+
+void
+acc_get_thread_num (void)
+{
+  const int thread_num = __builtin_GOACC_get_thread_num ();
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_get_thread_num" } } */
+}
+
+void
+acc_get_num_threads (void)
+{
+  const int num_threads = __builtin_GOACC_get_num_threads ();
+
+  /* { dg-final { scan-assembler-not "__builtin_GOACC_get_num_threads" } } */
+}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]