This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

OpenACC 2.5 Profiling Interface


Hi!

On Tue, 28 Feb 2017 18:43:36 +0100, I wrote:
> The 2.5 versions of the OpenACC standard added a new chapter "Profiling
> Interface".

I'd like to get that into trunk.  It's not yet complete (that is, doesn't
provide all the information specified), but it's very useful already, and
the missing pieces can later be added incrementally.

Jakub, would you please especially review the non-OpenACC-specific
changes here, including the libgomp ABI changes?

(Note that this patch doesn't apply on top of trunk.  I extracted it out
of openacc-gcc-8-branch, plus additional changes, and it depends on a
number of other pending patches.  Due to the many regions of code
touched, there are a lot of "textual" conflicts when porting it to
current trunk, but the "structure" will be the same.)

 libgomp/Makefile.am                                |  18 +-
 libgomp/acc_prof.h                                 | 235 +++++++
 libgomp/env.c                                      |   2 +
 libgomp/libgomp-plugin.c                           |   9 +
 libgomp/libgomp-plugin.h                           |   6 +
 libgomp/libgomp.map                                |  10 +
 libgomp/libgomp.texi                               | 307 ++++++++-
 libgomp/oacc-async.c                               |  97 +++
 libgomp/oacc-cuda.c                                |  72 +-
 libgomp/oacc-init.c                                | 131 +++-
 libgomp/oacc-int.h                                 |  22 +-
 libgomp/oacc-mem.c                                 | 213 +++++-
 libgomp/oacc-parallel.c                            | 427 +++++++++++-
 libgomp/oacc-plugin.c                              |  13 +
 libgomp/oacc-plugin.h                              |   3 +
 libgomp/oacc-profiling-acc_register_library.c      |  39 ++
 libgomp/oacc-profiling-locinfo.c                   | 138 ++++
 libgomp/oacc-profiling.c                           | 650 ++++++++++++++++++
 libgomp/plugin/plugin-nvptx.c                      | 144 ++++
 .../acc_prof-dispatch-1.c                          | 350 ++++++++++
 .../acc_prof-init-1-debug_info.c                   |   5 +
 .../libgomp.oacc-c-c++-common/acc_prof-init-1.c    | 388 +++++++++++
 .../libgomp.oacc-c-c++-common/acc_prof-kernels-1.c | 252 +++++++
 .../acc_prof-parallel-1-debug_info.c               |   5 +
 .../acc_prof-parallel-1.c                          | 737 +++++++++++++++++++++
 .../acc_prof-valid_bytes-1.c                       | 178 +++++
 .../libgomp.oacc-c-c++-common/acc_prof-version-1.c |  60 ++
 32 files changed, 4466 insertions(+), 81 deletions(-)

diff --git libgomp/Makefile.am libgomp/Makefile.am
index 8446b8d82c6d..eb0d58cb5103 100644
--- libgomp/Makefile.am
+++ libgomp/Makefile.am
@@ -13,11 +13,12 @@ search_path = $(addprefix $(top_srcdir)/config/, $(config_path)) $(top_srcdir) \
 fincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)$(MULTISUBDIR)/finclude
 libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
 
+libgomp_la_LIBADD =
+
 LIBFFI = @LIBFFI@
 LIBFFIINCS = @LIBFFIINCS@
 
 if USE_LIBFFI
-libgomp_la_LIBADD = $(LIBFFI)
+libgomp_la_LIBADD += $(LIBFFI)
 endif
 
 vpath % $(strip $(search_path))
@@ -26,6 +27,13 @@ AM_CPPFLAGS = $(addprefix -I, $(search_path)) $(LIBFFIINCS)
 AM_CFLAGS = $(XCFLAGS)
 AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
 
+#TODO Probably have to specify dependency in the top-level build system, too.
+libgomp_la_LIBADD += ../libbacktrace/libbacktrace.la
+AM_CPPFLAGS += \
+	-I$(srcdir)/$(MULTISRCTOP)../libbacktrace \
+	-I$(MULTIBUILDTOP)../libbacktrace \
+	-I../libbacktrace
+
 toolexeclib_LTLIBRARIES = libgomp.la
 nodist_toolexeclib_HEADERS = libgomp.spec
 
@@ -70,7 +78,9 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
 	parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \
 	proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
 	splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
-	oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c
+	oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
+	oacc-profiling.c oacc-profiling-acc_register_library.c \
+	oacc-profiling-locinfo.c
 
 include $(top_srcdir)/plugin/Makefrag.am
 
@@ -79,7 +89,7 @@ libgomp_la_SOURCES += openacc.f90
 endif
 
 nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = acc_prof.h omp.h openacc.h
 if USE_FORTRAN
 nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
 	openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
diff --git libgomp/acc_prof.h libgomp/acc_prof.h
new file mode 100644
index 000000000000..7ae0372a3220
--- /dev/null
+++ libgomp/acc_prof.h
@@ -0,0 +1,235 @@
+/* OpenACC Runtime Library: Profiling Interface
+
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ACC_PROF_H
+#define _ACC_PROF_H 1
+
+/* The OpenACC standard doesn't say so explicitly, but as its Profiling
+   Interface makes use of, for example, <openacc.h>'s acc_device_t, we
+   supposedly are to #include that file here.  */
+#include <openacc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.1. Events.  */
+
+typedef enum acc_event_t
+{
+  acc_ev_none = 0,
+  acc_ev_device_init_start,
+  acc_ev_device_init_end,
+  acc_ev_device_shutdown_start,
+  acc_ev_device_shutdown_end,
+  acc_ev_runtime_shutdown,
+  acc_ev_create,
+  acc_ev_delete,
+  acc_ev_alloc,
+  acc_ev_free,
+  acc_ev_enter_data_start,
+  acc_ev_enter_data_end,
+  acc_ev_exit_data_start,
+  acc_ev_exit_data_end,
+  acc_ev_update_start,
+  acc_ev_update_end,
+  acc_ev_compute_construct_start,
+  acc_ev_compute_construct_end,
+  acc_ev_enqueue_launch_start,
+  acc_ev_enqueue_launch_end,
+  acc_ev_enqueue_upload_start,
+  acc_ev_enqueue_upload_end,
+  acc_ev_enqueue_download_start,
+  acc_ev_enqueue_download_end,
+  acc_ev_wait_start,
+  acc_ev_wait_end,
+  acc_ev_last
+} acc_event_t;
+
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.2. Callbacks Signature.  */
+
+/* 'In all cases, a datatype of "size_t" means a 32-bit integer for a 32-bit
+   binary and a 64-bit integer for a 64-bit binary, and a datatype "int" means
+   a 32-bit integer for both 32-bit and 64-bit binaries'.  */
+typedef long int _acc_prof_size_t;
+typedef int _acc_prof_int_t;
+
+/* Internal helpers: a struct's "valid_bytes" may be less than its "sizeof".  */
+#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
+  offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
+#if 0 /* Untested.  */
+#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
+  ((_n - 1) * sizeof (_type) + (_valid_bytes_type))
+#endif
+#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
+  (sizeof (_basictype))
+
+typedef struct acc_prof_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  _acc_prof_int_t version;
+  acc_device_t device_type;
+  _acc_prof_int_t device_number;
+  _acc_prof_int_t thread_id;
+  _acc_prof_size_t async;
+  _acc_prof_size_t async_queue;
+  const char *src_file;
+  const char *func_name;
+  _acc_prof_int_t line_no, end_line_no;
+  _acc_prof_int_t func_line_no, func_end_line_no;
+#define _ACC_PROF_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
+} acc_prof_info;
+
+/* We implement the OpenACC 2.5 Profiling Interface.  */
+#define _ACC_PROF_INFO_VERSION 201510
+
+typedef enum acc_construct_t
+{
+  acc_construct_parallel = 0,
+  acc_construct_kernels,
+  acc_construct_loop,
+  acc_construct_data,
+  acc_construct_enter_data,
+  acc_construct_exit_data,
+  acc_construct_host_data,
+  acc_construct_atomic,
+  acc_construct_declare,
+  acc_construct_init,
+  acc_construct_shutdown,
+  acc_construct_set,
+  acc_construct_update,
+  acc_construct_routine,
+  acc_construct_wait,
+  acc_construct_runtime_api
+} acc_construct_t;
+
+typedef struct acc_data_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *var_name;
+  _acc_prof_size_t bytes;
+  void *host_ptr;
+  void *device_ptr;
+#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_data_event_info;
+
+typedef struct acc_launch_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *kernel_name;
+  _acc_prof_size_t num_gangs, num_workers, vector_length;
+#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
+} acc_launch_event_info;
+
+typedef struct acc_other_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_other_event_info;
+
+typedef union acc_event_info
+{
+  acc_event_t event_type;
+  acc_data_event_info data_event;
+  acc_launch_event_info launch_event;
+  acc_other_event_info other_event;
+} acc_event_info;
+
+typedef enum acc_device_api
+{
+  acc_device_api_none = 0,
+  acc_device_api_cuda,
+  acc_device_api_opencl,
+  acc_device_api_coi,
+  acc_device_api_other
+} acc_device_api;
+
+typedef struct acc_api_info
+{
+  acc_device_api device_api;
+  _acc_prof_int_t valid_bytes;
+  acc_device_t device_type;
+  _acc_prof_int_t vendor;
+  void *device_handle;
+  void *context_handle;
+  void *async_handle;
+#define _ACC_API_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_api_info;
+
+typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
+				   acc_api_info *);
+
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.3. Loading the Library.  */
+
+typedef enum acc_register_t
+{
+  acc_reg = 0,
+  acc_toggle = 1,
+  acc_toggle_per_thread = 2
+} acc_register_t;
+
+typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
+extern void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t) __GOACC_NOTHROW;
+extern void acc_prof_unregister (acc_event_t, acc_prof_callback, acc_register_t) __GOACC_NOTHROW;
+typedef void (*acc_query_fn) ();
+typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
+extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
+/* Don't tag "acc_register_library" as "__GOACC_NOTHROW": this function can be
+   overridden by the application, and must be expected to do "everything".  */
+extern void acc_register_library (acc_prof_reg, acc_prof_reg, acc_prof_lookup_func);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ACC_PROF_H */
diff --git libgomp/env.c libgomp/env.c
index 871a3e4cb40f..c99ba85fc88f 100644
--- libgomp/env.c
+++ libgomp/env.c
@@ -1338,5 +1338,7 @@ initialize_env (void)
   parse_acc_device_type ();
 
   goacc_runtime_initialize ();
+
+  goacc_profiling_initialize ();
 }
 #endif /* LIBGOMP_OFFLOADED_ONLY */
diff --git libgomp/libgomp-plugin.c libgomp/libgomp-plugin.c
index d67b458f714b..7cb123c2a998 100644
--- libgomp/libgomp-plugin.c
+++ libgomp/libgomp-plugin.c
@@ -29,6 +29,7 @@
 #include <stdlib.h>
 
 #include "libgomp.h"
+#include "oacc-int.h"
 #include "libgomp-plugin.h"
 
 void *
@@ -78,3 +79,11 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
   gomp_vfatal (msg, ap);
   va_end (ap);
 }
+
+void
+GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
+				      acc_event_info *event_info,
+				      acc_api_info *api_info)
+{
+  goacc_profiling_dispatch (prof_info, event_info, api_info);
+}
diff --git libgomp/libgomp-plugin.h libgomp/libgomp-plugin.h
index 286f7a65d326..42b673b7b43f 100644
--- libgomp/libgomp-plugin.h
+++ libgomp/libgomp-plugin.h
@@ -33,6 +33,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "acc_prof.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -88,6 +90,10 @@ extern void GOMP_PLUGIN_error (const char *, ...)
 extern void GOMP_PLUGIN_fatal (const char *, ...)
 	__attribute__ ((noreturn, format (printf, 1, 2)));
 
+extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
+						  acc_event_info *,
+						  acc_api_info *);
+
 /* Prototypes for functions implemented by libgomp plugins.  */
 extern const char *GOMP_OFFLOAD_get_name (void);
 extern unsigned int GOMP_OFFLOAD_get_caps (void);
diff --git libgomp/libgomp.map libgomp/libgomp.map
index a48393b3771a..78c7f292f6ee 100644
--- libgomp/libgomp.map
+++ libgomp/libgomp.map
@@ -424,6 +424,10 @@ OACC_2.5 {
 	acc_get_default_async_h_;
 	acc_memcpy_from_device_async;
 	acc_memcpy_to_device_async;
+	acc_prof_lookup;
+	acc_prof_register;
+	acc_prof_unregister;
+	acc_register_library;
 	acc_set_default_async;
 	acc_set_default_async_h_;
 	acc_update_device_async;
@@ -482,3 +486,9 @@ GOMP_PLUGIN_1.2 {
   global:
 	GOMP_PLUGIN_acc_thread_default_async;
 } GOMP_PLUGIN_1.1;
+
+GOMP_PLUGIN_1.3 {
+  global:
+	GOMP_PLUGIN_goacc_profiling_dispatch;
+	GOMP_PLUGIN_goacc_thread;
+} GOMP_PLUGIN_1.2;
diff --git libgomp/libgomp.texi libgomp/libgomp.texi
index 7a19d382cfc1..5bee6d2f316e 100644
--- libgomp/libgomp.texi
+++ libgomp/libgomp.texi
@@ -111,6 +111,7 @@ changed to GNU Offloading and Multi Processing Runtime Library.
                                asynchronous operations.
 * OpenACC Library Interoperability:: OpenACC library interoperability with the
                                NVIDIA CUBLAS library.
+* OpenACC Profiling Interface::
 * The libgomp ABI::            Notes on the external ABI presented by libgomp.
 * Reporting Bugs::             How to report bugs in the GNU Offloading and
                                Multi Processing Runtime Library.
@@ -2843,13 +2844,15 @@ A.2.1.4.
 @node OpenACC Environment Variables
 @chapter OpenACC Environment Variables
 
-The variables @env{ACC_DEVICE_TYPE} and @env{ACC_DEVICE_NUM}
+The variables @env{ACC_DEVICE_TYPE}, @env{ACC_DEVICE_NUM},
+and @code{ACC_PROFLIB}
 are defined by section 4 of the OpenACC specification in version 2.5.
 The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
 
 @menu
 * ACC_DEVICE_TYPE::
 * ACC_DEVICE_NUM::
+* ACC_PROFLIB::
 * GCC_ACC_NOTIFY::
 @end menu
 
@@ -2875,6 +2878,19 @@ The variable @env{GCC_ACC_NOTIFY} is used for diagnostic purposes.
 
 
 
+@node ACC_PROFLIB
+@section @code{ACC_PROFLIB}
+@table @asis
+@item @emph{See also}:
+@ref{OpenACC Profiling Interface}
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.5}, section
+4.3.
+@end table
+
+
+
 @node GCC_ACC_NOTIFY
 @section @code{GCC_ACC_NOTIFY}
 @table @asis
@@ -3090,6 +3106,295 @@ Application Programming Interface}, version 2.5.}
 
 
 
+@c ---------------------------------------------------------------------
+@c OpenACC Profiling Interface
+@c ---------------------------------------------------------------------
+
+@node OpenACC Profiling Interface
+@chapter OpenACC Profiling Interface
+
+@section Implementation Status and Implementation-Defined Behavior
+
+We're implementing most of the Profiling Interface as defined by
+the OpenACC 2.5 specification.  The specification doesn't
+clearly define some aspects of its Profiling Interface, so we're
+clarifying these as @emph{implementation-defined behavior} here.  We
+already have reported to the OpenACC Technical Committee some issues,
+and will report more, later on.
+
+This implementation of the OpenACC Profiling Interface is tuned to
+keep the performance impact as low as possible when it's not in use.
+This is relevant, as the Profiling Interface affects all the
+@emph{hot} code paths (in the target code, not in the offloaded code).
+Users of the OpenACC Profiling Interface can be expected to understand
+that performance will always be impacted to some degree: for example,
+because of the @emph{runtime} (libgomp) calling into a third-party
+@emph{library} for every event that has been registered.
+
+This implementation of the OpenACC Profiling Interface has not yet
+been validated for use in multi-threaded code.  This is a more general
+issue; see CSTS-110 @cite{Make sure all OpenACC entry points in
+libgomp are thread-safe}.
+
+The @code{acc_prof_lookup} interface is not implemented, and
+@code{acc_register_library} will receive @code{NULL} for its
+@code{lookup} parameter.
+
+Remarks about data provided to callbacks:
+
+@table @asis
+
+@item @code{acc_prof_info.event_type}
+It is not clear if for @emph{nested} event callbacks (for example,
+@code{acc_ev_enqueue_launch_start} as part of a parent compute
+construct), this should be set for the nested event
+(@code{acc_ev_enqueue_launch_start}), or if the value of the parent
+construct should remain (@code{acc_ev_compute_construct_start}).  In
+this implementation, the value will generally correspond to the
+innermost nested event type.
+
+@item @code{acc_prof_info.device_type}
+@itemize
+
+@item
+For @code{acc_ev_compute_construct_start}, and in presence of an
+@code{if} clause with @emph{false} argument, this will still refer to
+the offloading device type; unsure whether that's the expected
+behavior.
+
+@item
+Complementary to the item before, for
+@code{acc_ev_compute_construct_end}, this is set to
+@code{acc_device_host} in presence of an @code{if} clause with
+@emph{false} argument, unsure whether that's the expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.thread_id}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.async}
+@itemize
+
+@item
+Not yet implemented correctly for
+@code{acc_ev_compute_construct_start}.
+
+@item
+In a compute construct, for host-fallback
+execution/@code{acc_device_host} it will always be
+@code{acc_async_sync}; unsure if that is the expected behavior.
+
+@item
+For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end},
+it will always be @code{acc_async_sync}; unsure if that is the
+expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.async_queue}
+There is no @cite{limited number of asynchronous queues} in libgomp.
+We define this to always have the same value as
+@code{acc_prof_info.async}.
+
+@item @code{acc_prof_info.src_file}, @code{acc_prof_info.func_name}, @code{acc_prof_info.line_no}
+If libbacktrace is available and functional (that is, @code{-g} debug
+information is available), these will be set accordingly for a lot of
+event types.  Otherwise, these will be set to @code{NULL}
+(@code{acc_prof_info.src_file}, @code{acc_prof_info.func_name}), or
+@code{-1} (@code{acc_prof_info.line_no}), respectively.
+
+@item @code{acc_prof_info.end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type}
+Relating to @code{acc_prof_info.event_type} discussed above, in this
+implementation, this will always be the same value as
+@code{acc_prof_info.event_type}.
+
+@item @code{acc_event_info.*.parent_construct}
+@itemize
+
+@item
+Will be @code{acc_construct_parallel} for OpenACC kernels constructs;
+should be @code{acc_construct_kernels}.
+
+@item
+Will be @code{acc_construct_enter_data} or
+@code{acc_construct_exit_data} when processing variable mappings
+specified in OpenACC declare directives; should be
+@code{acc_construct_declare}.
+
+@item
+For implicit @code{acc_ev_device_init_start},
+@code{acc_ev_device_init_end}, and explicit as well as implicit
+@code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start},
+@code{acc_ev_enqueue_download_end}, @code{acc_ev_wait_start}, and
+@code{acc_ev_wait_end}, will be
+@code{acc_construct_parallel}; should reflect the real parent
+construct.
+
+@end itemize
+
+@item @code{acc_event_info.*.implicit}
+For @code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start},
+@code{acc_ev_enqueue_download_end}, @code{acc_ev_wait_start}, and
+@code{acc_ev_wait_end}, this currently will be @code{1}
+also for explicit usage.
+
+@item @code{acc_event_info.data_event.var_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_event_info.data_event.host_ptr}
+For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always
+@code{NULL}.
+
+@item @code{typedef union acc_api_info}
+@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific
+Information}, should obviously be @code{typedef @emph{struct}
+acc_api_info}.
+
+@item @code{acc_api_info.device_api}
+Possibly not yet implemented correctly for
+@code{acc_ev_compute_construct_start},
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}:
+will always be @code{acc_device_api_none} for these event types.
+For @code{acc_ev_enter_data_start}, it will be
+@code{acc_device_api_none} in some cases.
+
+@item @code{acc_api_info.device_type}
+Always the same as @code{acc_prof_info.device_type}.
+
+@item @code{acc_api_info.vendor}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_api_info.device_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.context_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.async_handle}
+Always @code{NULL}; not yet implemented.
+
+@end table
+
+Remarks about certain event types:
+
+@table @asis
+
+@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+@itemize
+
+@item
+@c See DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT in
+@c libgomp.oacc-c-c++-common/acc_prof-parallel-1.c.
+Whan a compute construct triggers implicit
+@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}
+events, they currently aren't @emph{nested within} the corresponding
+@code{acc_ev_compute_construct_start} and
+@code{acc_ev_compute_construct_end}, but they're currently observed
+@emph{before} @code{acc_ev_compute_construct_start}.  It is not clear
+what to do: the standard asks us provide a lot of details to the
+@code{acc_ev_compute_construct_start} callback, without (implicitly)
+initializing a device before?
+
+@item
+Callbacks for these event types will not be invoked for calls to the
+@code{acc_set_device_type} and @code{acc_set_device_num} functions;
+it's not clear if they should be.
+
+@end itemize
+
+@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}, @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
+@itemize
+
+@item
+Callbacks for these event types will also be invoked for OpenACC
+host_data constructs; it's not clear if they should be.
+
+@item
+Callbacks for these event types will also be invoked when processing
+variable mappings specified in OpenACC declare directives; it's not
+clear if they should be.
+
+@end itemize
+
+@end table
+
+Callbacks for the following event types will be invoked, but dispatch
+and information provided therein has not yet been thoroughly reviewed:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
+@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
+@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
+@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
+@end itemize
+
+During device initialization, and finalization, respectively,
+callbacks for the following event types will not yet be invoked:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@end itemize
+
+Callbacks for the following event types have not yet been implemented,
+so currently won't be invoked:
+
+@itemize
+@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
+@item @code{acc_ev_runtime_shutdown}
+@item @code{acc_ev_create}, @code{acc_ev_delete}
+@end itemize
+
+For the following runtime library functions, not all expected
+callbacks will be invoked (mostly concerning implicit device
+initialization):
+
+@itemize
+@item @code{acc_get_num_devices}
+@item @code{acc_set_device_type}
+@item @code{acc_get_device_type}
+@item @code{acc_set_device_num}
+@item @code{acc_get_device_num}
+@item @code{acc_init}
+@item @code{acc_shutdown}
+@end itemize
+
+Aside from implicit device initialization, for the following runtime
+library functions, no callbacks will be invoked for shared-memory
+offloading devices (it's not clear if they should be):
+
+@itemize
+@item @code{acc_malloc}
+@item @code{acc_free}
+@item @code{acc_copyin}, @code{acc_present_or_copyin}, @code{acc_copyin_async}
+@item @code{acc_create}, @code{acc_present_or_create}, @code{acc_create_async}
+@item @code{acc_copyout}, @code{acc_copyout_async}, @code{acc_copyout_finalize}, @code{acc_copyout_finalize_async}
+@item @code{acc_delete}, @code{acc_delete_async}, @code{acc_delete_finalize}, @code{acc_delete_finalize_async}
+@item @code{acc_update_device}, @code{acc_update_device_async}
+@item @code{acc_update_self}, @code{acc_update_self_async}
+@item @code{acc_map_data}, @code{acc_unmap_data}
+@item @code{acc_memcpy_to_device}, @code{acc_memcpy_to_device_async}
+@item @code{acc_memcpy_from_device}, @code{acc_memcpy_from_device_async}
+@end itemize
+
+
+
 @c ---------------------------------------------------------------------
 @c The libgomp ABI
 @c ---------------------------------------------------------------------
diff --git libgomp/oacc-async.c libgomp/oacc-async.c
index 6ef7115fa355..f651ee15a3c0 100644
--- libgomp/oacc-async.c
+++ libgomp/oacc-async.c
@@ -117,9 +117,26 @@ acc_async_test (int async)
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
   int res = thr->dev->openacc.async.test_func (aq);
 
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   return res;
 }
 
@@ -130,6 +147,12 @@ acc_async_test_all (void)
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
   int ret = 1;
   gomp_mutex_lock (&thr->dev->openacc.async.lock);
   for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
@@ -140,6 +163,11 @@ acc_async_test_all (void)
       }
   gomp_mutex_unlock (&thr->dev->openacc.async.lock);
 
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
   return ret;
 }
 
@@ -151,11 +179,28 @@ acc_wait (int async)
 
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
   goacc_aq aq = lookup_goacc_asyncqueue (thr, true, async);
   thr->dev->openacc.async.synchronize_func (aq);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.  */
@@ -174,6 +219,17 @@ acc_wait_async (int async1, int async2)
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async2;
+      prof_info.async_queue = prof_info.async;
+    }
+
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
@@ -186,6 +242,12 @@ acc_wait_async (int async1, int async2)
 
   thr->dev->openacc.async.synchronize_func (aq1);
   thr->dev->openacc.async.serialize_func (aq1, aq2);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -193,6 +255,12 @@ acc_wait_all (void)
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
@@ -202,6 +270,12 @@ acc_wait_all (void)
   for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
     dev->openacc.async.synchronize_func (l->aq);
   gomp_mutex_unlock (&dev->openacc.async.lock);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 /* acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.  */
@@ -223,6 +297,17 @@ acc_wait_all_async (int async)
 
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   if (!thr || !thr->dev)
     gomp_fatal ("no device active");
 
@@ -236,11 +321,20 @@ acc_wait_all_async (int async)
 	thr->dev->openacc.async.serialize_func (l->aq, waiting_queue);
     }
   gomp_mutex_unlock (&thr->dev->openacc.async.lock);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 int
 acc_get_default_async (void)
 {
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   struct goacc_thread *thr = goacc_thread ();
 
   if (!thr || !thr->dev)
@@ -252,6 +346,9 @@ acc_get_default_async (void)
 void
 acc_set_default_async (int async)
 {
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   if (async < acc_async_sync)
     gomp_fatal ("invalid async argument: %d", async);
 
diff --git libgomp/oacc-cuda.c libgomp/oacc-cuda.c
index cde51b768a10..7e902894c591 100644
--- libgomp/oacc-cuda.c
+++ libgomp/oacc-cuda.c
@@ -36,10 +36,23 @@ acc_get_current_cuda_device (void)
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
-    return thr->dev->openacc.cuda.get_current_device_func ();
+    ret = thr->dev->openacc.cuda.get_current_device_func ();
 
-  return NULL;
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return ret;
 }
 
 void *
@@ -47,10 +60,23 @@ acc_get_current_cuda_context (void)
 {
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
+  void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
-    return thr->dev->openacc.cuda.get_current_context_func ();
- 
-  return NULL;
+    ret = thr->dev->openacc.cuda.get_current_context_func ();
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
+  return ret;
 }
 
 void *
@@ -61,6 +87,17 @@ acc_get_cuda_stream (int async)
   if (async < 0)
     return NULL;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   void *ret = NULL;
   if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
     {
@@ -69,6 +106,12 @@ acc_get_cuda_stream (int async)
 	ret = thr->dev->openacc.cuda.get_stream_func (aq);
     }
 
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   return ret;
 }
 
@@ -80,10 +123,21 @@ acc_set_cuda_stream (int async, void *stream)
   if (async < 0 || stream == NULL)
     return 0;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   int ret = -1;
   if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
     {
@@ -93,5 +147,11 @@ acc_set_cuda_stream (int async, void *stream)
       gomp_mutex_unlock (&thr->dev->openacc.async.lock);
     }
 
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   return ret;
 }
diff --git libgomp/oacc-init.c libgomp/oacc-init.c
index c09ba35a6fe5..05559accb0db 100644
--- libgomp/oacc-init.c
+++ libgomp/oacc-init.c
@@ -230,13 +230,68 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
    held before calling this function.  */
 
 static struct gomp_device_descr *
-acc_init_1 (acc_device_t d)
+acc_init_1 (struct goacc_thread *thr, acc_device_t d,
+	    acc_construct_t parent_construct, int implicit,
+	    int acc_prof_locinfo_skip)
 {
   gomp_mutex_lock (&acc_init_state_lock);
   acc_init_state = initializing;
   acc_init_thread = pthread_self ();
   gomp_mutex_unlock (&acc_init_state_lock);
 
+  bool check_not_nested_p;
+  if (implicit)
+    {
+      /* In the implicit case, there should (must?) already be something
+	 have been set up for an outer construct.  */
+      check_not_nested_p = false;
+    }
+  else
+    {
+      check_not_nested_p = true;
+    }
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (check_not_nested_p),
+			false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_device_init_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = d;
+      prof_info.device_number = goacc_device_num;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync;
+      prof_info.async_queue = prof_info.async;
+      goacc_profiling_locinfo_fill (thr, &prof_info, acc_prof_locinfo_skip + 1);
+    }
+  acc_event_info device_init_event_info;
+  if (profiling_dispatch_p)
+    {
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      device_init_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      device_init_event_info.other_event.parent_construct = parent_construct;
+      device_init_event_info.other_event.implicit = implicit;
+      device_init_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
+
   struct gomp_device_descr *base_dev, *acc_dev;
   int ndevs;
 
@@ -259,6 +314,14 @@ acc_init_1 (acc_device_t d)
   gomp_init_device (acc_dev);
   gomp_mutex_unlock (&acc_dev->lock);
 
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_device_init_end;
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &device_init_event_info,
+				&api_info);
+    }
+
   gomp_mutex_lock (&acc_init_state_lock);
   acc_init_state = initialized;
   gomp_mutex_unlock (&acc_init_state_lock);
@@ -454,7 +517,12 @@ goacc_attach_host_thread_to_device (int ord)
   thr->dev = acc_dev = &base_dev[ord];
   thr->saved_bound_dev = NULL;
   thr->mapped_data = NULL;
-  
+  thr->prof_info = NULL;
+  thr->api_info = NULL;
+  /* Initially, all callbacks for all events are enabled.  */
+  thr->prof_callbacks_enabled = true;
+  thr->backtrace_state = NULL;
+
   thr->target_tls
     = acc_dev->openacc.create_thread_data_func (ord);
 
@@ -470,9 +538,7 @@ acc_init (acc_device_t d)
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
-
-  cached_base_dev = acc_init_1 (d);
-
+  cached_base_dev = acc_init_1 (NULL, d, acc_construct_runtime_api, 0, 1);
   gomp_mutex_unlock (&acc_device_lock);
   
   goacc_attach_host_thread_to_device (-1);
@@ -531,6 +597,14 @@ acc_set_device_type (acc_device_t d)
   struct gomp_device_descr *base_dev, *acc_dev;
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    prof_info.device_type = d;
+
   gomp_init_targets_once ();
 
   gomp_mutex_lock (&acc_device_lock);
@@ -555,6 +629,12 @@ acc_set_device_type (acc_device_t d)
     }
 
   goacc_attach_host_thread_to_device (-1);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 ialias (acc_set_device_type)
@@ -590,12 +670,24 @@ acc_get_device_type (void)
     ;
   else
     {
+      acc_prof_info prof_info;
+      acc_api_info api_info;
+      bool profiling_setup_p
+	= __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			    false);
+
       gomp_init_targets_once ();
 
       gomp_mutex_lock (&acc_device_lock);
       dev = resolve_device (acc_device_default, true);
       gomp_mutex_unlock (&acc_device_lock);
       res = acc_device_type (dev->type);
+
+      if (profiling_setup_p)
+	{
+	  thr->prof_info = NULL;
+	  thr->api_info = NULL;
+	}
     }
 
   assert (res != acc_device_default
@@ -612,6 +704,14 @@ acc_get_device_num (acc_device_t d)
   const struct gomp_device_descr *dev;
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    prof_info.device_type = d;
+
   if (d >= _ACC_device_hwm)
     gomp_fatal ("unknown device type %u", (unsigned) d);
 
@@ -621,6 +721,12 @@ acc_get_device_num (acc_device_t d)
   dev = resolve_device (d, true);
   gomp_mutex_unlock (&acc_device_lock);
 
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   if (thr && thr->base_dev == dev && thr->dev)
     return thr->dev->target_id;
 
@@ -735,15 +841,20 @@ goacc_restore_bind (void)
    On exit "goacc_thread" will return a valid & populated thread block.  */
 
 attribute_hidden void
-goacc_lazy_initialize (void)
+goacc_lazy_initialize (int acc_prof_locinfo_skip)
 {
   struct goacc_thread *thr = goacc_thread ();
-
   if (thr && thr->dev)
     return;
 
+  gomp_init_targets_once ();
+
+  gomp_mutex_lock (&acc_device_lock);
   if (!cached_base_dev)
-    acc_init (acc_device_default);
-  else
-    goacc_attach_host_thread_to_device (-1);
+    cached_base_dev = acc_init_1 (thr, acc_device_default,
+				  acc_construct_parallel, 1,
+				  acc_prof_locinfo_skip + 1);
+  gomp_mutex_unlock (&acc_device_lock);
+
+  goacc_attach_host_thread_to_device (-1);
 }
diff --git libgomp/oacc-int.h libgomp/oacc-int.h
index ecc905214ac4..780bec03d156 100644
--- libgomp/oacc-int.h
+++ libgomp/oacc-int.h
@@ -40,6 +40,8 @@
 
 #include "openacc.h"
 #include "config.h"
+#include "acc_prof.h"
+#include "backtrace.h"
 #include <stddef.h>
 #include <stdbool.h>
 #include <stdarg.h>
@@ -68,6 +70,14 @@ struct goacc_thread
      strictly push/pop semantics according to lexical scope.  */
   struct target_mem_desc *mapped_data;
 
+  /* Data of the OpenACC Profiling Interface.  */
+  acc_prof_info *prof_info;
+  acc_api_info *api_info;
+  /* Per-thread toggle of OpenACC Profiling Interface callbacks.  */
+  bool prof_callbacks_enabled;
+  /* Per-thread state of libbacktrace.  */
+  struct backtrace_state *backtrace_state;
+
   /* These structures form a list: this is the next thread in that list.  */
   struct goacc_thread *next;
 
@@ -99,7 +109,7 @@ void goacc_attach_host_thread_to_device (int);
 void goacc_runtime_initialize (void);
 void goacc_save_and_set_bind (acc_device_t);
 void goacc_restore_bind (void);
-void goacc_lazy_initialize (void);
+void goacc_lazy_initialize (int);
 void goacc_host_init (void);
 
 void goacc_init_asyncqueues (struct gomp_device_descr *);
@@ -111,6 +121,16 @@ void goacc_async_free (struct gomp_device_descr *,
 struct goacc_asyncqueue *get_goacc_asyncqueue (int);
 struct goacc_asyncqueue *lookup_goacc_asyncqueue (struct goacc_thread *, bool, int);
 
+void goacc_profiling_initialize (void);
+bool goacc_profiling_setup_p (struct goacc_thread *,
+			      acc_prof_info *, acc_api_info *, int);
+bool goacc_profiling_dispatch_p (bool);
+void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
+			       acc_api_info *);
+void goacc_profiling_locinfo_initialize ();
+void goacc_profiling_locinfo_fill (struct goacc_thread *,
+				   acc_prof_info *, int);
+
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility pop
 #endif
diff --git libgomp/oacc-mem.c libgomp/oacc-mem.c
index 0e39aff84343..20b9c2090f84 100644
--- libgomp/oacc-mem.c
+++ libgomp/oacc-mem.c
@@ -99,16 +99,31 @@ acc_malloc (size_t s)
   if (!s)
     return NULL;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
   assert (thr->dev);
 
+  void *ret;
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-    return malloc (s);
+    ret = malloc (s);
+  else
+    ret = thr->dev->alloc_func (thr->dev->target_id, s);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 
-  return thr->dev->alloc_func (thr->dev->target_id, s);
+  return ret;
 }
 
 /* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
@@ -124,12 +139,22 @@ acc_free (void *d)
 
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
   assert (thr && thr->dev);
 
   struct gomp_device_descr *acc_dev = thr->dev;
 
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-    return free (d);
+    {
+      free (d);
+
+      goto out;
+    }
 
   gomp_mutex_lock (&acc_dev->lock);
 
@@ -151,16 +176,35 @@ acc_free (void *d)
 
   if (!acc_dev->free_func (acc_dev->target_id, d))
     gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
+
+ out:
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 static void
 memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
-		      const char *libfnname)
+		      const char *libfnname, int acc_prof_locinfo_skip)
 {
   /* No need to call lazy open here, as the device pointer must have
      been obtained from a routine that did that.  */
   struct goacc_thread *thr = goacc_thread ();
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+						 acc_prof_locinfo_skip + 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   assert (thr && thr->dev);
 
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
@@ -169,7 +213,8 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
 	memmove (h, d, s);
       else
 	memmove (d, h, s);
-      return;
+
+      goto out;
     }
 
   goacc_aq aq = get_goacc_asyncqueue (async);
@@ -177,30 +222,37 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
     gomp_copy_dev2host (thr->dev, aq, h, d, s);
   else
     gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
+
+ out:
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 acc_memcpy_to_device (void *d, void *h, size_t s)
 {
-  memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
+  memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__, 1);
 }
 
 void
 acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
 {
-  memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
+  memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__, 1);
 }
 
 void
 acc_memcpy_from_device (void *h, void *d, size_t s)
 {
-  memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
+  memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__, 1);
 }
 
 void
 acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
 {
-  memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
+  memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__, 1);
 }
 
 /* Return the device pointer that corresponds to host data H.  Or NULL
@@ -213,7 +265,7 @@ acc_deviceptr (void *h)
   void *d;
   void *offset;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *dev = thr->dev;
@@ -221,6 +273,9 @@ acc_deviceptr (void *h)
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&dev->lock);
 
   n = lookup_host (dev, h, 1);
@@ -250,7 +305,7 @@ acc_hostptr (void *d)
   void *h;
   void *offset;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
@@ -258,6 +313,9 @@ acc_hostptr (void *d)
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return d;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
@@ -287,7 +345,7 @@ acc_is_present (void *h, size_t s)
   if (!s || !h)
     return 0;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
@@ -295,6 +353,9 @@ acc_is_present (void *h, size_t s)
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h != NULL;
 
+  /* In the following, no OpenACC Profiling Interface events can possibly be
+     generated.  */
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -321,11 +382,17 @@ acc_map_data (void *h, void *d, size_t s)
   size_t sizes = s;
   unsigned short kinds = GOMP_MAP_ALLOC;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     {
       if (d != h)
@@ -366,6 +433,12 @@ acc_map_data (void *h, void *d, size_t s)
   tgt->prev = acc_dev->openacc.data_environ;
   acc_dev->openacc.data_environ = tgt;
   gomp_mutex_unlock (&acc_dev->lock);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -380,6 +453,12 @@ acc_unmap_data (void *h)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+
   size_t host_size;
 
   gomp_mutex_lock (&acc_dev->lock);
@@ -433,6 +512,12 @@ acc_unmap_data (void *h)
   gomp_mutex_unlock (&acc_dev->lock);
 
   gomp_unmap_vars (t, true);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 #define FLAG_PRESENT (1 << 0)
@@ -440,7 +525,8 @@ acc_unmap_data (void *h)
 #define FLAG_COPY (1 << 2)
 
 static void *
-present_create_copy (unsigned f, void *h, size_t s, int async)
+present_create_copy (unsigned f, void *h, size_t s, int async,
+		     int acc_prof_locinfo_skip)
 {
   void *d;
   splay_tree_key n;
@@ -448,7 +534,7 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
   if (!h || !s)
     gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s);
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
@@ -456,6 +542,18 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return h;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+						 acc_prof_locinfo_skip + 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -518,19 +616,26 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
       gomp_mutex_unlock (&acc_dev->lock);
     }
 
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
+
   return d;
 }
 
 void *
 acc_create (void *h, size_t s)
 {
-  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync);
+  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync,
+			      1);
 }
 
 void
 acc_create_async (void *h, size_t s, int async)
 {
-  present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async);
+  present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async, 1);
 }
 
 /* acc_present_or_create used to be what acc_create is now.  */
@@ -556,13 +661,13 @@ void *
 acc_copyin (void *h, size_t s)
 {
   return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s,
-			      acc_async_sync);
+			      acc_async_sync, 1);
 }
 
 void
 acc_copyin_async (void *h, size_t s, int async)
 {
-  present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async);
+  present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async, 1);
 }
 
 /* acc_present_or_copyin used to be what acc_copyin is now.  */
@@ -588,7 +693,8 @@ acc_pcopyin (void *h, size_t s)
 #define FLAG_FINALIZE (1 << 1)
 
 static void
-delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
+delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname,
+		int acc_prof_locinfo_skip)
 {
   size_t host_size;
   splay_tree_key n;
@@ -599,6 +705,18 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
   if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     return;
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+						 acc_prof_locinfo_skip + 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   gomp_mutex_lock (&acc_dev->lock);
 
   n = lookup_host (acc_dev, h, s);
@@ -672,64 +790,71 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
     }
 
   gomp_mutex_unlock (&acc_dev->lock);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 acc_delete (void *h , size_t s)
 {
-  delete_copyout (0, h, s, acc_async_sync, __FUNCTION__);
+  delete_copyout (0, h, s, acc_async_sync, __FUNCTION__, 1);
 }
 
 void
 acc_delete_async (void *h , size_t s, int async)
 {
-  delete_copyout (0, h, s, async, __FUNCTION__);
+  delete_copyout (0, h, s, async, __FUNCTION__, 1);
 }
 
 void
 acc_delete_finalize (void *h , size_t s)
 {
-  delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__);
+  delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__, 1);
 }
 
 void
 acc_delete_finalize_async (void *h , size_t s, int async)
 {
-  delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__);
+  delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__, 1);
 }
 
 void
 acc_copyout (void *h, size_t s)
 {
-  delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__);
+  delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__, 1);
 }
 
 void
 acc_copyout_async (void *h, size_t s, int async)
 {
-  delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__);
+  delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__, 1);
 }
 
 void
 acc_copyout_finalize (void *h, size_t s)
 {
   delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync,
-		  __FUNCTION__);
+		  __FUNCTION__, 1);
 }
 
 void
 acc_copyout_finalize_async (void *h, size_t s, int async)
 {
-  delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__);
+  delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__, 1);
 }
 
 static void
-update_dev_host (int is_dev, void *h, size_t s, int async)
+update_dev_host (int is_dev, void *h, size_t s, int async,
+		 int acc_prof_locinfo_skip)
 {
   splay_tree_key n;
   void *d;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (acc_prof_locinfo_skip + 1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
@@ -739,6 +864,18 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
 
   gomp_mutex_lock (&acc_dev->lock);
 
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info,
+						 acc_prof_locinfo_skip + 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   n = lookup_host (acc_dev, h, s);
 
   if (!n)
@@ -758,30 +895,36 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
     gomp_copy_dev2host (acc_dev, aq, h, d, s);
 
   gomp_mutex_unlock (&acc_dev->lock);
+  
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 acc_update_device (void *h, size_t s)
 {
-  update_dev_host (1, h, s, acc_async_sync);
+  update_dev_host (1, h, s, acc_async_sync, 1);
 }
 
 void
 acc_update_device_async (void *h, size_t s, int async)
 {
-  update_dev_host (1, h, s, async);
+  update_dev_host (1, h, s, async, 1);
 }
 
 void
 acc_update_self (void *h, size_t s)
 {
-  update_dev_host (0, h, s, acc_async_sync);
+  update_dev_host (0, h, s, acc_async_sync, 1);
 }
 
 void
 acc_update_self_async (void *h, size_t s, int async)
 {
-  update_dev_host (0, h, s, async);
+  update_dev_host (0, h, s, async, 1);
 }
 
 void
diff --git libgomp/oacc-parallel.c libgomp/oacc-parallel.c
index 6a7d068300e4..28ed9c367b19 100644
--- libgomp/oacc-parallel.c
+++ libgomp/oacc-parallel.c
@@ -145,7 +145,8 @@ goacc_call_host_fn (void (*fn) (void *), size_t mapnum, void **hostaddrs,
 static void
 GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
 			       size_t mapnum, void **hostaddrs, size_t *sizes,
-			       unsigned short *kinds, va_list *ap)
+			       unsigned short *kinds, va_list *ap,
+			       int acc_prof_locinfo_skip)
 {
   bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
   struct goacc_thread *thr;
@@ -167,27 +168,79 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
   gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n",
 	      __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
 #endif
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (acc_prof_locinfo_skip + 1);
 
   thr = goacc_thread ();
   acc_dev = thr->dev;
 
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_compute_construct_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      goacc_profiling_locinfo_fill (thr, &prof_info, acc_prof_locinfo_skip + 1);
+    }
+  acc_event_info compute_construct_event_info;
+  if (profiling_dispatch_p)
+    {
+      compute_construct_event_info.other_event.event_type
+	= prof_info.event_type;
+      compute_construct_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      compute_construct_event_info.other_event.parent_construct
+	= acc_construct_parallel;
+      compute_construct_event_info.other_event.implicit = 0;
+      compute_construct_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+			      &api_info);
+
   handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
 
   /* Host fallback if "if" clause is false or if the current device is set to
      the host.  */
   if (host_fallback)
     {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
       goacc_save_and_set_bind (acc_device_host);
       goacc_call_host_fn (fn, mapnum, hostaddrs, params);
       goacc_restore_bind ();
-      return;
+      goto out;
     }
   else if (acc_device_type (acc_dev->type) == acc_device_host)
     {
       goacc_call_host_fn (fn, mapnum, hostaddrs, params);
-      return;
+      goto out;
     }
+  else if (profiling_dispatch_p)
+    api_info.device_api = acc_device_api_cuda;
 
   /* Default: let the runtime choose.  */
   for (i = 0; i != GOMP_DIM_MAX; i++)
@@ -219,6 +272,13 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
 
 	    if (async == GOMP_LAUNCH_OP_MAX)
 	      async = va_arg (*ap, unsigned);
+
+	    if (profiling_dispatch_p)
+	      {
+		prof_info.async = async;
+		prof_info.async_queue = prof_info.async;
+	      }
+
 	    break;
 	  }
 
@@ -257,10 +317,34 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
   else
     tgt_fn = (void (*)) fn;
 
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_start;
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+	= compute_construct_event_info.other_event.parent_construct;
+      enter_exit_data_event_info.other_event.implicit = 1;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
+    }
+
   goacc_aq aq = get_goacc_asyncqueue (async);
 
   tgt = gomp_map_vars_async (acc_dev, aq, mapnum, hostaddrs, NULL, sizes, kinds,
 			     true, GOMP_MAP_VARS_OPENACC);
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_end;
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
+    }
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
@@ -281,8 +365,25 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
       else
 	acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
 				    dims, tgt);
+      if (profiling_dispatch_p)
+	{
+	  prof_info.event_type = acc_ev_exit_data_start;
+	  enter_exit_data_event_info.other_event.event_type
+	    = prof_info.event_type;
+	  enter_exit_data_event_info.other_event.tool_info = NULL;
+	  goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				    &api_info);
+	}
       /* If running synchronously, unmap immediately.  */
       gomp_unmap_vars (tgt, true);
+      if (profiling_dispatch_p)
+	{
+	  prof_info.event_type = acc_ev_exit_data_end;
+	  enter_exit_data_event_info.other_event.event_type
+	    = prof_info.event_type;
+	  goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				    &api_info);
+	}
     }
   else
     {
@@ -294,6 +395,19 @@ GOACC_parallel_keyed_internal (int device, int params, void (*fn) (void *),
 					  devaddrs, dims, tgt, aq);
       goacc_async_copyout_unmap_vars (tgt, aq);
     }
+
+ out:
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_compute_construct_end;
+      compute_construct_event_info.other_event.event_type
+	= prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+				&api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -304,7 +418,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
   va_list ap;
   va_start (ap, kinds);
   GOACC_parallel_keyed_internal (device, 0, fn, mapnum, hostaddrs, sizes,
-				 kinds, &ap);
+				 kinds, &ap, 1);
   va_end (ap);
 }
 
@@ -316,7 +430,7 @@ GOACC_parallel_keyed_v2 (int device, int args, void (*fn) (void *),
   va_list ap;
   va_start (ap, kinds);
   GOACC_parallel_keyed_internal (device, args, fn, mapnum, hostaddrs, sizes,
-				 kinds, &ap);
+				 kinds, &ap, 1);
   va_end (ap);
 }
 
@@ -349,23 +463,84 @@ GOACC_data_start (int device, size_t mapnum,
 	      __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
 #endif
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_enter_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync; /* Always synchronous.  */
+      prof_info.async_queue = prof_info.async;
+      goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+    }
+  acc_event_info enter_data_event_info;
+  if (profiling_dispatch_p)
+    {
+      enter_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_data_event_info.other_event.parent_construct = acc_construct_data;
+      for (int i = 0; i < mapnum; ++i)
+	if (kinds[i] == GOMP_MAP_USE_DEVICE_PTR)
+	  {
+	    /* If there is one such data mapping kind, then this is actually an
+	       OpenACC host_data construct.  (GCC maps the OpenACC host_data
+	       construct to the OpenACC data construct.)  Apart from artificial
+	       test cases (such as an OpenACC host_data construct's (implicit)
+	       device initialization when there hasn't been any device data be
+	       set up before...), there can't really any meaningful events be
+	       generated from OpenACC host_data constructs, though.  */
+	    enter_data_event_info.other_event.parent_construct
+	      = acc_construct_host_data;
+	    break;
+	  }
+      enter_data_event_info.other_event.implicit = 0;
+      enter_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
   handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
 
   /* Host fallback or 'do nothing'.  */
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || host_fallback)
     {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
       tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
 			   GOMP_MAP_VARS_OPENACC);
       tgt->prev = thr->mapped_data;
       thr->mapped_data = tgt;
-
-      return;
+      goto out;
     }
 
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
@@ -374,18 +549,86 @@ GOACC_data_start (int device, size_t mapnum,
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
   tgt->prev = thr->mapped_data;
   thr->mapped_data = tgt;
+
+ out:
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_end;
+      enter_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_data_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 GOACC_data_end (void)
 {
   struct goacc_thread *thr = goacc_thread ();
+  struct gomp_device_descr *acc_dev = thr->dev;
   struct target_mem_desc *tgt = thr->mapped_data;
 
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_exit_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = acc_async_sync; /* Always synchronous.  */
+      prof_info.async_queue = prof_info.async;
+      goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+    }
+  acc_event_info exit_data_event_info;
+  if (profiling_dispatch_p)
+    {
+      exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      exit_data_event_info.other_event.parent_construct = acc_construct_data;
+      exit_data_event_info.other_event.implicit = 0;
+      exit_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
   gomp_debug (0, "  %s: restore mappings\n", __FUNCTION__);
   thr->mapped_data = tgt->prev;
   gomp_unmap_vars (tgt, true);
   gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
+
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_exit_data_end;
+      exit_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &exit_data_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
@@ -393,6 +636,9 @@ GOACC_enter_exit_data (int device, size_t mapnum,
 		       void **hostaddrs, size_t *sizes, unsigned short *kinds,
 		       int async, int num_waits, ...)
 {
+  struct goacc_thread *thr;
+  struct gomp_device_descr *acc_dev;
+  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
   bool data_enter = false;
   size_t i;
 
@@ -437,7 +683,67 @@ GOACC_enter_exit_data (int device, size_t mapnum,
 		      kind);
     }
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
+
+  thr = goacc_thread ();
+  acc_dev = thr->dev;
+
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type
+	= data_enter ? acc_ev_enter_data_start : acc_ev_exit_data_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+    }
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_dispatch_p)
+    {
+      enter_exit_data_event_info.other_event.event_type
+	= prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+	= data_enter ? acc_construct_enter_data : acc_construct_exit_data;
+      enter_exit_data_event_info.other_event.implicit = 0;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+			      &api_info);
+
+  if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+      || host_fallback)
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+      goto out;
+    }
 
   if (num_waits > 0)
     {
@@ -558,6 +864,18 @@ GOACC_enter_exit_data (int device, size_t mapnum,
 	    i += pointer - 1;
 	  }
       }
+
+ out:
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = data_enter ? acc_ev_enter_data_end: acc_ev_exit_data_end;
+      enter_exit_data_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+				&api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 static void
@@ -596,14 +914,64 @@ GOACC_update (int device, size_t mapnum,
   bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
   size_t i;
 
-  goacc_lazy_initialize ();
+  goacc_lazy_initialize (1);
 
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
 
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (true), false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_update_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1;
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+      goacc_profiling_locinfo_fill (thr, &prof_info, 1);
+    }
+  acc_event_info update_event_info;
+  if (profiling_dispatch_p)
+    {
+      update_event_info.other_event.event_type
+	= prof_info.event_type;
+      update_event_info.other_event.valid_bytes
+	= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      update_event_info.other_event.parent_construct = acc_construct_update;
+      update_event_info.other_event.implicit = 0;
+      update_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1;
+      api_info.device_handle = NULL;
+      api_info.context_handle = NULL;
+      api_info.async_handle = NULL;
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || host_fallback)
-    return;
+    {
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
+      goto out;
+    }
 
   if (num_waits > 0)
     {
@@ -675,11 +1043,40 @@ GOACC_update (int device, size_t mapnum,
 	  break;
 	}
     }
+
+ out:
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_update_end;
+      update_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &update_event_info, &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 void
 GOACC_wait (int async, int num_waits, ...)
 {
+  goacc_lazy_initialize (1);
+
+  struct goacc_thread *thr = goacc_thread ();
+
+  /* No nesting.  */
+  assert (thr->prof_info == NULL);
+  assert (thr->api_info == NULL);
+  acc_prof_info prof_info;
+  acc_api_info api_info;
+  bool profiling_setup_p
+    = __builtin_expect (goacc_profiling_setup_p (thr, &prof_info, &api_info, 1),
+			false);
+  if (profiling_setup_p)
+    {
+      prof_info.async = async;
+      prof_info.async_queue = prof_info.async;
+    }
+
   if (num_waits)
     {
       va_list ap;
@@ -692,6 +1089,12 @@ GOACC_wait (int async, int num_waits, ...)
     acc_wait_all ();
   else if (async == acc_async_noval)
     acc_wait_all_async (async);
+
+  if (profiling_setup_p)
+    {
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 int
diff --git libgomp/oacc-plugin.c libgomp/oacc-plugin.c
index a7eecdc3bbc7..10a1169dc486 100644
--- libgomp/oacc-plugin.c
+++ libgomp/oacc-plugin.c
@@ -39,6 +39,19 @@ GOMP_PLUGIN_acc_thread (void)
   return thr ? thr->target_tls : NULL;
 }
 
+/* Return the TLS data for the current thread.  */
+/* TODO.  Should we be able to directly call (the static inline function)
+   goacc_thread from within plugin code?  I didn't manage to get the
+   "goacc_tls_data" symbol configured correctly: "[...]/ld:
+   .libs/libgomp-plugin-nvptx.so.1.0.0: hidden symbol `goacc_tls_data' isn't
+   defined".  */
+
+struct goacc_thread *
+GOMP_PLUGIN_goacc_thread (void)
+{
+  return goacc_thread ();
+}
+
 /* Return the default async number from the TLS data for the current thread.  */
 
 int
diff --git libgomp/oacc-plugin.h libgomp/oacc-plugin.h
index adf04d62b4af..52949ca704b0 100644
--- libgomp/oacc-plugin.h
+++ libgomp/oacc-plugin.h
@@ -27,8 +27,11 @@
 #ifndef OACC_PLUGIN_H
 #define OACC_PLUGIN_H 1
 
+#include "oacc-int.h"
+
 extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
 extern void *GOMP_PLUGIN_acc_thread (void);
+extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
 extern int GOMP_PLUGIN_acc_thread_default_async (void);
 
 #endif
diff --git libgomp/oacc-profiling-acc_register_library.c libgomp/oacc-profiling-acc_register_library.c
new file mode 100644
index 000000000000..f6b482b51f4b
--- /dev/null
+++ libgomp/oacc-profiling-acc_register_library.c
@@ -0,0 +1,39 @@
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file provides an stub acc_register_library function.  It's in a
+   separate file so that this function can easily be overridden when linking
+   statically.  */
+
+#include "libgomp.h"
+#include "acc_prof.h"
+
+void
+acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
+		      acc_prof_lookup_func lookup)
+{
+  gomp_debug (0, "dummy %s\n", __FUNCTION__);
+}
diff --git libgomp/oacc-profiling-locinfo.c libgomp/oacc-profiling-locinfo.c
new file mode 100644
index 000000000000..c139fd70b6f7
--- /dev/null
+++ libgomp/oacc-profiling-locinfo.c
@@ -0,0 +1,138 @@
+/* Copyright (C) 2018 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* OpenACC Profiling Interface: location information.  */
+
+#include "libgomp.h"
+#include "oacc-int.h"
+#include "backtrace-supported.h"
+#ifdef HAVE_STRING_H
+# include <string.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+
+
+/* Initialize.  */
+void
+goacc_profiling_locinfo_initialize ()
+{
+  if (!BACKTRACE_SUPPORTED)
+    gomp_debug (0, "libbacktrace not supported:"
+		" can't provide location information in"
+		" the OpenACC Profiling Interface\n");
+}
+
+/* Initialize for "thr" if not yet done.  */
+static void
+goacc_profiling_locinfo_initialize_thr (struct goacc_thread *thr)
+{
+  if (__builtin_expect (thr->backtrace_state == NULL, false))
+    {
+      /* Separate state per thread, to avoid locking.  */
+      thr->backtrace_state = backtrace_create_state (NULL, 0, NULL, NULL);
+      if (thr->backtrace_state == NULL)
+	gomp_fatal ("failed to create state information for libbacktrace");
+    }
+}
+
+static void
+error_callback (void *data, const char *msg, int errnum)
+{
+  if (errnum < 0)
+    gomp_debug (0, "could not generate backtrace: %s\n", msg);
+  else if (errnum == 0)
+    gomp_debug (0, "could not generate backtrace: %s\n", msg);
+  else
+    gomp_debug (0, "could not generate backtrace: %s: %s\n", msg, strerror (errnum));
+}
+
+static int
+full_callback (void *data, uintptr_t pc, const char *filename, int lineno, const char *function)
+{
+  gomp_debug (0, "  libbacktrace found for PC '0x"
+#ifdef HAVE_INTTYPES_H
+	      "%" PRIxPTR
+#else
+	      "%lu"
+#endif
+	      "': filename '%s', lineno '%d', function '%s'\n",
+#ifndef HAVE_INTTYPES_H
+	      (unsigned long)
+#endif
+	      pc,
+	      filename ?: "NULL",
+	      lineno,
+	      function ?: "NULL");
+
+  acc_prof_info *prof_info = (acc_prof_info *) data;
+
+  /* "backtrace.h" states that "the FILENAME and FUNCTION buffers may become
+     invalid after this function returns".  But given that OpenACC 2.5 states
+     that "if the library wants to save [these], it should allocate memory and
+     copy the string[s]", it is fine to pass the pointers here.  */
+  prof_info->src_file = filename;
+  prof_info->func_name = function;
+  if (lineno > 0)
+    prof_info->line_no = lineno;
+  else
+    prof_info->line_no = -1;
+
+  /* Stop here; we got what we need.  */
+  return 1;
+}
+
+void goacc_profiling_locinfo_fill (struct goacc_thread *thr,
+				   acc_prof_info *prof_info, int skip)
+{
+  /* Default: all unknown.  */
+  prof_info->src_file = NULL;
+  prof_info->func_name = NULL;
+  prof_info->line_no = -1;
+  prof_info->end_line_no = -1;
+  prof_info->func_line_no = -1;
+  prof_info->func_end_line_no = -1;
+
+  if (!BACKTRACE_SUPPORTED)
+    {
+      /* We diagnosed this in goacc_profiling_locinfo_initialize.  */
+      return;
+    }
+
+  /* Special case for oacc-init.c:acc_init_1.  */
+  /* See also oacc-profiling.c:goacc_profiling_setup_p.  */
+  if (__builtin_expect (thr == NULL, false))
+    {
+      gomp_debug (0, "Can't look up location information for"
+		  " the current call, construct, or directive\n");
+      return;
+    }
+
+  goacc_profiling_locinfo_initialize_thr (thr);
+
+  backtrace_full (thr->backtrace_state, skip, full_callback, error_callback, prof_info);
+}
diff --git libgomp/oacc-profiling.c libgomp/oacc-profiling.c
new file mode 100644
index 000000000000..6cbc766efd3e
--- /dev/null
+++ libgomp/oacc-profiling.c
@@ -0,0 +1,650 @@
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* OpenACC Profiling Interface.  */
+
+#define _GNU_SOURCE
+#include "libgomp.h"
+#include "oacc-int.h"
+#include "secure_getenv.h"
+#include "acc_prof.h"
+#include <assert.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#endif
+#ifdef PLUGIN_SUPPORT
+# include <dlfcn.h>
+#endif
+
+#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")")
+
+/* Statically assert that the layout of the common fields in the
+   "acc_event_info" variants matches.  */
+/* event_type */
+STATIC_ASSERT (offsetof (acc_event_info, event_type)
+	       == offsetof (acc_event_info, data_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+	       == offsetof (acc_event_info, launch_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+	       == offsetof (acc_event_info, other_event.event_type));
+/* valid_bytes */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+	       == offsetof (acc_event_info, launch_event.valid_bytes));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+	       == offsetof (acc_event_info, other_event.valid_bytes));
+/* parent_construct */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+	       == offsetof (acc_event_info, launch_event.parent_construct));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+	       == offsetof (acc_event_info, other_event.parent_construct));
+/* implicit */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+	       == offsetof (acc_event_info, launch_event.implicit));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+	       == offsetof (acc_event_info, other_event.implicit));
+/* tool_info */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+	       == offsetof (acc_event_info, launch_event.tool_info));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+	       == offsetof (acc_event_info, other_event.tool_info));
+
+struct goacc_prof_callback_entry
+{
+  acc_prof_callback cb;
+  int ref;
+  bool enabled;
+  struct goacc_prof_callback_entry *next;
+};
+
+/* Using a separate flag to minimize run-time performance impact in the (very
+   common) case that profiling is not enabled.  */
+static bool goacc_prof_enabled;
+/* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle.  */
+static bool goacc_prof_callbacks_enabled[acc_ev_last];
+static struct goacc_prof_callback_entry *goacc_prof_callback_entries[acc_ev_last];
+
+/* This lock is used to protect access to goacc_prof_callbacks_enabled, and
+   goacc_prof_callback_entries.  */
+static gomp_mutex_t goacc_prof_lock;
+
+void
+goacc_profiling_initialize (void)
+{
+  gomp_mutex_init (&goacc_prof_lock);
+
+  /* Initially, all callbacks for all events are enabled.  */
+  for (int i = 0; i < acc_ev_last; ++i)
+    goacc_prof_callbacks_enabled[i] = true;
+  /* ..., but profiling is still disabled.  */
+  __atomic_store_n (&goacc_prof_enabled, false, MEMMODEL_RELAXED);
+
+  /* We are to invoke an external acc_register_library routine, defaulting to
+     our stub oacc-profiling-acc_register_library.c:acc_register_library
+     implementation.  */
+  gomp_debug (0, "%s: calling acc_register_library\n", __FUNCTION__);
+  acc_register_library (acc_prof_register, acc_prof_unregister, NULL);
+#ifdef PLUGIN_SUPPORT
+  char *acc_proflibs = secure_getenv ("ACC_PROFLIB");
+  while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
+    {
+      char *acc_proflibs_sep = strchr (acc_proflibs, ';');
+      char *acc_proflib;
+      if (acc_proflibs_sep == acc_proflibs)
+	{
+	  /* Stray ";" separator: make sure we don't dlopen the main
+	     program.  */
+	  acc_proflib = NULL;
+	}
+      else
+	{
+	  if (acc_proflibs_sep != NULL)
+	    {
+	      /* Single out the first library.  */
+	      acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1);
+	      memcpy (acc_proflib, acc_proflibs,
+		      acc_proflibs_sep - acc_proflibs);
+	      acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0';
+	    }
+	  else
+	    {
+	      /* No ";" separator, so only one library.  */
+	      acc_proflib = acc_proflibs;
+	    }
+
+	  gomp_debug (0, "%s: dlopen(%s)\n", __FUNCTION__, acc_proflib);
+	  void *dl_handle = dlopen (acc_proflib, RTLD_LAZY);
+	  if (dl_handle != NULL)
+	    {
+	      typeof (&acc_register_library) a_r_l
+		= dlsym (dl_handle, "acc_register_library");
+	      if (a_r_l == NULL)
+		goto dl_fail;
+	      /* Avoid duplicate registration, for example if the same shared
+		 library is specified in LD_PRELOAD and ACC_PROFLIB -- which
+		 TAU 2.26 does when using "tau_exec -openacc".  */
+	      if (a_r_l == acc_register_library)
+		gomp_debug (0, "  %s: skipping duplicate"
+			    " %s:acc_register_library\n",
+			    __FUNCTION__, acc_proflib);
+	      else
+		{
+		  gomp_debug (0, "  %s: calling %s:acc_register_library\n",
+			      __FUNCTION__, acc_proflib);
+		  a_r_l (acc_prof_register, acc_prof_unregister, NULL);
+		}
+	    }
+	  else
+	    {
+	    dl_fail:
+	      gomp_error ("while loading ACC_PROFLIB %s: %s",
+			  acc_proflib, dlerror ());
+	      if (dl_handle != NULL)
+		{
+		  int err = dlclose (dl_handle);
+		  dl_handle = NULL;
+		  if (err != 0)
+		    goto dl_fail;
+		}
+	    }
+	}
+
+      if (acc_proflib != acc_proflibs)
+	{
+	  free (acc_proflib);
+
+	  acc_proflibs = acc_proflibs_sep + 1;
+	}
+      else
+	acc_proflibs = NULL;
+    }
+#endif /* PLUGIN_SUPPORT */
+
+  goacc_profiling_locinfo_initialize ();
+}
+
+void
+acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  __atomic_store_n (&goacc_prof_enabled, true, MEMMODEL_RELAXED);
+
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+	      __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+  enum
+  {
+    EVENT_KIND_BOGUS,
+    EVENT_KIND_NORMAL,
+    /* As end events invoke callbacks in the reverse order, we register these
+       in the reverse order here.  */
+    EVENT_KIND_END,
+  } event_kind = EVENT_KIND_BOGUS;
+  switch (ev)
+    {
+    case acc_ev_none:
+    case acc_ev_device_init_start:
+    case acc_ev_device_shutdown_start:
+    case acc_ev_runtime_shutdown:
+    case acc_ev_create:
+    case acc_ev_delete:
+    case acc_ev_alloc:
+    case acc_ev_free:
+    case acc_ev_enter_data_start:
+    case acc_ev_exit_data_start:
+    case acc_ev_update_start:
+    case acc_ev_compute_construct_start:
+    case acc_ev_enqueue_launch_start:
+    case acc_ev_enqueue_upload_start:
+    case acc_ev_enqueue_download_start:
+    case acc_ev_wait_start:
+      event_kind = EVENT_KIND_NORMAL;
+      break;
+    case acc_ev_device_init_end:
+    case acc_ev_device_shutdown_end:
+    case acc_ev_enter_data_end:
+    case acc_ev_exit_data_end:
+    case acc_ev_update_end:
+    case acc_ev_compute_construct_end:
+    case acc_ev_enqueue_launch_end:
+    case acc_ev_enqueue_upload_end:
+    case acc_ev_enqueue_download_end:
+    case acc_ev_wait_end:
+      event_kind = EVENT_KIND_END;
+      break;
+    case acc_ev_last:
+      break;
+    }
+  if (event_kind == EVENT_KIND_BOGUS)
+    {
+      gomp_error ("ignoring %s request for invalid acc_event_t %d",
+		  __FUNCTION__, (int) ev);
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      gomp_error ("ignoring %s request with invalid acc_register_t %d",
+		  __FUNCTION__, (int) reg);
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+	{
+	  gomp_debug (0, "  globally enabling callbacks\n");
+	  gomp_mutex_lock (&goacc_prof_lock);
+	  /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global
+	     toggle.  */
+	  goacc_prof_callbacks_enabled[ev] = true;
+	  gomp_mutex_unlock (&goacc_prof_lock);
+	  return;
+	}
+      else if (ev == acc_ev_none && cb != NULL)
+	{
+	  gomp_debug (0, "  ignoring request\n");
+	  /* Silently ignore request.  */
+	  return;
+	}
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+	{
+	  gomp_debug (0, "  thread: enabling callbacks\n");
+	  goacc_lazy_initialize (1);
+	  struct goacc_thread *thr = goacc_thread ();
+	  thr->prof_callbacks_enabled = true;
+	  return;
+	}
+      gomp_error ("ignoring %s request for acc_toggle_per_thread",
+		  __FUNCTION__);
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+	break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      /* If we already have this callback registered, just increment its ref
+	 count.  */
+      if (it != NULL)
+	{
+	  it->ref++;
+	  gomp_debug (0, "  already registered;"
+		      " incrementing ref count to: %d\n", it->ref);
+	}
+      else
+	{
+	  struct goacc_prof_callback_entry *e
+	    = gomp_malloc (sizeof (struct goacc_prof_callback_entry));
+	  e->cb = cb;
+	  e->ref = 1;
+	  e->enabled = true;
+	  bool prepend = (event_kind == EVENT_KIND_END);
+	  /* If we don't have any callback registered yet, also use the
+	     "prepend" code path.  */
+	  if (it_p == NULL)
+	    prepend = true;
+	  if (prepend)
+	    {
+	      gomp_debug (0, "  prepending\n");
+	      e->next = goacc_prof_callback_entries[ev];
+	      goacc_prof_callback_entries[ev] = e;
+	    }
+	  else
+	    {
+	      gomp_debug (0, "  appending\n");
+	      e->next = NULL;
+	      it_p->next = e;
+	    }
+	}
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+	{
+	  /* Silently ignore acc_toggle request if not registered.  */
+	  gomp_debug (0, "  not enabling; not registered\n");
+	}
+      else
+	{
+	  gomp_debug (0, "  enabling\n");
+	  it->enabled = true;
+	}
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+void
+acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+	      __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+  if (ev < acc_ev_none
+      || ev >= acc_ev_last)
+    {
+      gomp_error ("ignoring %s request for invalid acc_event_t %d",
+		  __FUNCTION__, (int) ev);
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      gomp_error ("ignoring %s request with invalid acc_register_t %d",
+		  __FUNCTION__, (int) reg);
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+	{
+	  gomp_debug (0, "  globally disabling callbacks\n");
+	  gomp_mutex_lock (&goacc_prof_lock);
+	  /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global
+	     toggle.  */
+	  goacc_prof_callbacks_enabled[ev] = false;
+	  gomp_mutex_unlock (&goacc_prof_lock);
+	  return;
+	}
+      else if (ev == acc_ev_none && cb != NULL)
+	{
+	  gomp_debug (0, "  ignoring request\n");
+	  /* Silently ignore request.  */
+	  return;
+	}
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+	{
+	  gomp_debug (0, "  thread: disabling callbacks\n");
+	  goacc_lazy_initialize (1);
+	  struct goacc_thread *thr = goacc_thread ();
+	  thr->prof_callbacks_enabled = false;
+	  return;
+	}
+      gomp_error ("ignoring %s request for acc_toggle_per_thread",
+		  __FUNCTION__);
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+	break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      if (it == NULL)
+	{
+	  gomp_error ("ignoring %s request for acc_event_t %d: not registered",
+		      __FUNCTION__, (int) ev);
+	  gomp_mutex_unlock (&goacc_prof_lock);
+	  return;
+	}
+      it->ref--;
+      gomp_debug (0, "  decrementing ref count to: %d\n", it->ref);
+      if (it->ref == 0)
+	{
+	  if (it_p == NULL)
+	    goacc_prof_callback_entries[ev] = it->next;
+	  else
+	    it_p->next = it->next;
+	  free (it);
+	}
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+	{
+	  /* Silently ignore acc_toggle request if not registered.  */
+	  gomp_debug (0, "  not disabling; not registered\n");
+	}
+      else
+	{
+	  gomp_debug (0, "  disabling\n");
+	  it->enabled = false;
+	}
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+/* Set up to dispatch events?  */
+
+bool
+goacc_profiling_setup_p (struct goacc_thread *thr,
+			 acc_prof_info *prof_info, acc_api_info *api_info,
+			 int acc_prof_locinfo_skip)
+{
+  gomp_debug (0, "%s (%p)\n", __FUNCTION__, thr);
+
+  /* If we don't have any per-thread state yet, we can't register prof_info and
+     api_info.  */
+  /* See also oacc-profiling-locinfo.c:goacc_profiling_locinfo_fill.  */
+  if (__builtin_expect (thr == NULL, false))
+    {
+      gomp_debug (0, "Can't generate OpenACC Profiling Interface events for"
+		  " the current call, construct, or directive\n");
+      return false;
+    }
+
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (false), false);
+  if (thr->prof_info != NULL)
+    {
+      assert (profiling_dispatch_p);
+      /* Profiling has already been set up for an outer construct.  In this
+	 case, we continue to use the existing information, and thus return
+	 "false" here.
+
+	 This can happen, for example, for an enter data directive, which sets
+	 up profiling, then calls into acc_copyin, which should not again set
+	 up profiling, should not overwrite the existing information.  */
+      return false;
+    }
+
+  if (profiling_dispatch_p)
+    {
+      thr->prof_info = prof_info;
+
+      prof_info->event_type = -1; /* Must be set later.  */
+      prof_info->valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info->version = _ACC_PROF_INFO_VERSION;
+      if (thr->dev)
+	{
+	  prof_info->device_type = acc_device_type (thr->dev->type);
+	  prof_info->device_number = thr->dev->target_id;
+	}
+      else
+	{
+	  prof_info->device_type = -1;
+	  prof_info->device_number = -1;
+	}
+      prof_info->thread_id = -1;
+      prof_info->async = acc_async_sync;
+      prof_info->async_queue = prof_info->async;
+      goacc_profiling_locinfo_fill (thr, prof_info, acc_prof_locinfo_skip + 1);
+
+      thr->api_info = api_info;
+
+      api_info->device_api = acc_device_api_none;
+      api_info->valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info->device_type = prof_info->device_type;
+      api_info->vendor = -1;
+      api_info->device_handle = NULL;
+      api_info->context_handle = NULL;
+      api_info->async_handle = NULL;
+    }
+
+  return profiling_dispatch_p;
+}
+
+/* Prepare to dispatch events?  */
+
+bool
+goacc_profiling_dispatch_p (bool check_not_nested_p)
+{
+  if (__builtin_expect (__atomic_load_n (&goacc_prof_enabled,
+					 MEMMODEL_RELAXED) != true, true))
+    return false;
+
+  gomp_debug (0, "%s\n", __FUNCTION__);
+
+  struct goacc_thread *thr = goacc_thread ();
+  if (__builtin_expect (thr == NULL, false))
+    {
+      /* If we don't have any per-thread state yet, that means that per-thread
+	 callback dispatch has not been explicitly disabled (which only a call
+	 to acc_prof_unregister with acc_toggle_per_thread will do, and that
+	 would have allocated per-thread state via goacc_lazy_initialize);
+	 initially, all callbacks for all events are enabled.  */
+      gomp_debug (0, "  %s: don't have any per-thread state yet\n", __FUNCTION__);
+    }
+  else
+    {
+      if (check_not_nested_p)
+	{
+	  /* No nesting.  */
+	  assert (thr->prof_info == NULL);
+	  assert (thr->api_info == NULL);
+	}
+
+      if (__builtin_expect (!thr->prof_callbacks_enabled, true))
+	{
+	  gomp_debug (0, "  %s: disabled for this thread\n", __FUNCTION__);
+	  return false;
+	}
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle.  */
+  if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true))
+    {
+      gomp_debug (0, "  %s: disabled globally\n", __FUNCTION__);
+      gomp_mutex_unlock (&goacc_prof_lock);
+      return false;
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+
+  return true;
+}
+
+/* Dispatch events.
+
+   This must only be called if goacc_profiling_dispatch_p returned a true
+   result.  */
+
+void
+goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info,
+			  acc_api_info *apt_info)
+{
+  acc_event_t event_type = event_info->event_type;
+  gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type);
+  assert (event_type > acc_ev_none
+	  && event_type < acc_ev_last);
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  if (!goacc_prof_callbacks_enabled[event_type])
+    {
+      gomp_debug (0, "  %s: disabled for this event type\n", __FUNCTION__);
+      gomp_mutex_unlock (&goacc_prof_lock);
+      return;
+    }
+
+  for (struct goacc_prof_callback_entry *e
+	 = goacc_prof_callback_entries[event_type];
+       e != NULL;
+       e = e->next)
+    {
+      if (!e->enabled)
+	{
+	  gomp_debug (0, "  %s: disabled for callback %p\n",
+		      __FUNCTION__, e->cb);
+	  continue;
+	}
+
+      gomp_debug (0, "  %s: calling callback %p\n", __FUNCTION__, e->cb);
+      e->cb (prof_info, event_info, apt_info);
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
diff --git libgomp/plugin/plugin-nvptx.c libgomp/plugin/plugin-nvptx.c
index 6aac13cca960..f036d04a5e9d 100644
--- libgomp/plugin/plugin-nvptx.c
+++ libgomp/plugin/plugin-nvptx.c
@@ -36,6 +36,7 @@
 #include "libgomp-plugin.h"
 #include "oacc-plugin.h"
 #include "gomp-constants.h"
+#include "oacc-int.h"
 
 #include <pthread.h>
 #include <cuda.h>
@@ -861,11 +862,52 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   // num_workers	ntid.y
   // vector length	ntid.x
 
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info enqueue_launch_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_start;
+
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      enqueue_launch_event_info.launch_event.valid_bytes
+	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
+      enqueue_launch_event_info.launch_event.parent_construct
+	= acc_construct_parallel;
+      enqueue_launch_event_info.launch_event.implicit = 1;
+      enqueue_launch_event_info.launch_event.tool_info = NULL;
+      enqueue_launch_event_info.launch_event.kernel_name
+	= targ_fn->launch->fn;
+      enqueue_launch_event_info.launch_event.num_gangs
+	= dims[GOMP_DIM_GANG];
+      enqueue_launch_event_info.launch_event.num_workers
+	= dims[GOMP_DIM_WORKER];
+      enqueue_launch_event_info.launch_event.vector_length
+	= dims[GOMP_DIM_VECTOR];
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+					    api_info);
+    }
+  
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
 		    dims[GOMP_DIM_GANG], 1, 1,
 		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
 		    0, stream, kargs, 0);
 
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_end;
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
+					    api_info);
+    }
+
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
 		     targ_fn->launch->fn);
 }
@@ -878,6 +920,36 @@ nvptx_alloc (size_t s)
   CUdeviceptr d;
 
   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_dispatch_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      acc_prof_info *prof_info = thr->prof_info;
+      acc_event_info data_event_info;
+      acc_api_info *api_info = thr->api_info;
+
+      prof_info->event_type = acc_ev_alloc;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+	= acc_construct_parallel;
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL;
+      data_event_info.data_event.bytes = s;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) d;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
+
   return (void *) d;
 }
 
@@ -1253,11 +1325,47 @@ openacc_exec_internal (void (*fn) (void *), int params, size_t mapnum,
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_start;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+	= acc_construct_parallel;
+      /* Always implicit for "data mapping arguments for cuLaunchKernel".  */
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL;
+      data_event_info.data_event.bytes = mapnum * sizeof (void *);
+      data_event_info.data_event.host_ptr = hp;
+      if (!params)
+	data_event_info.data_event.device_ptr = (void *) dp;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
 
   if (!params && mapnum > 0)
     CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
 		      mapnum * sizeof (void *));
 
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_end;
+      data_event_info.data_event.event_type = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
+
   if (params)
     nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
 		hp, NULL);
@@ -1338,6 +1446,34 @@ openacc_async_exec_internal (void (*fn) (void *), int params, size_t mapnum,
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_start;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+	= acc_construct_parallel;
+      /* Always implicit for "data mapping arguments for cuLaunchKernel".  */
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL;
+      data_event_info.data_event.bytes = mapnum * sizeof (void *);
+      data_event_info.data_event.host_ptr = hp;
+      if (!params)
+	data_event_info.data_event.device_ptr = (void *) dp;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
 
   if (!params && mapnum > 0)
     {
@@ -1350,6 +1486,14 @@ openacc_async_exec_internal (void (*fn) (void *), int params, size_t mapnum,
       block[1] = (void *) nvthd->ptx_dev;
     }
 
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_end;
+      data_event_info.data_event.event_type = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
+
   if (params)
     nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
 		hp, aq->cuda_stream);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
new file mode 100644
index 000000000000..bf31a4385597
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
@@ -0,0 +1,350 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+	  || state == 10
+	  || state == 30
+	  || state == 41
+	  || state == 51
+	  || state == 91
+	  || state == 101
+	  || state == 151);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+	  || state == 11
+	  || state == 40
+	  || state == 50
+	  || state == 90
+	  || state == 100
+	  || state == 150);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 14
+	  || state == 21
+	  || state == 32
+	  || state == 42
+	  || state == 80
+	  || state == 103
+	  || state == 152);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 13
+	  || state == 43
+	  || state == 102
+	  || state == 154);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 12
+	  || state == 20
+	  || state == 31
+	  || state == 44
+	  || state == 81
+	  || state == 104
+	  || state == 153);
+  STATE_OP (state, ++);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 2);
+  }
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 12);
+  }
+  assert (state == 15);
+
+  STATE_OP (state, = 20);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 20);
+  }
+  assert (state == 20);
+
+  STATE_OP (state, = 30);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 31);
+  }
+  assert (state == 33);
+
+  STATE_OP (state, = 40);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 42);
+  }
+  assert (state == 45);
+
+  STATE_OP (state, = 50);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 52);
+  }
+  assert (state == 52);
+
+  STATE_OP (state, = 60);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 60);
+  }
+  assert (state == 60);
+
+  STATE_OP (state, = 70);
+  unreg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 70);
+  }
+  assert (state == 70);
+
+  STATE_OP (state, = 80);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 80);
+  }
+  assert (state == 82);
+
+  STATE_OP (state, = 90);
+  reg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 92);
+  }
+  assert (state == 92);
+
+  STATE_OP (state, = 100);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 102);
+  }
+  assert (state == 105);
+
+  STATE_OP (state, = 110);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 110);
+  }
+  assert (state == 110);
+
+  STATE_OP (state, = 120);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 120);
+  }
+  assert (state == 120);
+
+  STATE_OP (state, = 130);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 130);
+  }
+  assert (state == 130);
+
+  STATE_OP (state, = 140);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 140);
+  }
+  assert (state == 140);
+
+  STATE_OP (state, = 150);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 152);
+  }
+  assert (state == 155);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1-debug_info.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1-debug_info.c
new file mode 100644
index 000000000000..f43e526dde0e
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1-debug_info.c
@@ -0,0 +1,5 @@
+/* Test "acc_prof-init-1.c" with debug info available.  */
+
+/* { dg-additional-options "-g -DDEBUG_INFO=1" }  */
+
+#include "acc_prof-init-1.c"
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
new file mode 100644
index 000000000000..ec5bcf6a90e5
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -0,0 +1,388 @@
+/* Test dispatch of events to callbacks.  */
+
+/* If not included from "acc_prof-parallel-1-debug_info.c".  */
+#ifndef DEBUG_INFO
+# define DEBUG_INFO 0
+#endif
+
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+
+static const char *acc_prof_src_file = NULL;
+static const char *acc_prof_func_name = NULL;
+static int acc_prof_line_no = -1;
+
+static void
+set_locinfo (const char *src_file, const char *func_name, int line_no)
+{
+  assert (acc_prof_src_file == NULL);
+  acc_prof_src_file = src_file;
+  assert (acc_prof_func_name == NULL);
+  acc_prof_func_name = func_name;
+  assert (acc_prof_line_no == -1);
+  acc_prof_line_no = line_no;
+}
+
+static void
+unset_locinfo ()
+{
+  assert (acc_prof_src_file != NULL);
+  acc_prof_src_file = NULL;
+  assert (acc_prof_func_name != NULL);
+  acc_prof_func_name = NULL;
+  assert (acc_prof_line_no != -1);
+  acc_prof_line_no = -1;
+}
+
+static void
+verify_locinfo (const acc_prof_info *prof_info)
+{
+  DEBUG_printf ("  acc_prof_src_file: '%s'\n", acc_prof_src_file ?: "NULL");
+  DEBUG_printf ("  prof_info->src_file: '%s'\n", prof_info->src_file ?: "NULL");
+  DEBUG_printf ("  acc_prof_func_name: '%s'\n", acc_prof_func_name ?: "NULL");
+  DEBUG_printf ("  prof_info->func_name: '%s'\n", prof_info->func_name ?: "NULL");
+  DEBUG_printf ("  acc_prof_line_no: '%d'\n", acc_prof_line_no);
+  DEBUG_printf ("  prof_info->line_no: '%d'\n", prof_info->line_no);
+
+  assert (acc_prof_src_file != NULL);
+  assert (acc_prof_func_name != NULL);
+  assert (acc_prof_line_no != -1);
+#if DEBUG_INFO
+  assert (prof_info->src_file != NULL);
+  assert (strcmp (prof_info->src_file, acc_prof_src_file) == 0);
+  assert (prof_info->func_name != NULL);
+  assert (strcmp (prof_info->func_name, acc_prof_func_name) == 0);
+  assert (prof_info->line_no == acc_prof_line_no);
+#else
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+#endif
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+}
+
+
+void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+	  || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 1)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+  //TODO verify_locinfo (prof_info);
+#else
+  verify_locinfo (prof_info);
+#endif
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 2)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+  //TODO verify_locinfo (prof_info);
+#else
+  verify_locinfo (prof_info);
+#endif
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 10
+	  || state == 110);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 11
+	  || state == 111);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  assert (state == 0);
+
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_init (acc_device_host);
+  unset_locinfo ();
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_device_type = acc_get_device_type ();
+  unset_locinfo ();
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_device_num = acc_get_device_num (acc_device_type);
+  unset_locinfo ();
+  acc_async = 12;
+
+  {
+    int state_init;
+    set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    unset_locinfo ();
+    set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc wait
+    unset_locinfo ();
+    assert (state_init == 11);
+  }
+  assert (state == 12);
+
+  STATE_OP (state, = 90);
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_shutdown (acc_device_host);
+  unset_locinfo ();
+  assert (state == 90);
+
+
+  STATE_OP (state, = 100);
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_init (acc_device_default);
+  unset_locinfo ();
+  assert (state == 102);
+
+  STATE_OP (state, = 110);
+
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_device_type = acc_get_device_type ();
+  unset_locinfo ();
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_device_num = acc_get_device_num (acc_device_type);
+  unset_locinfo ();
+  acc_async = 12;
+
+  {
+    int state_init;
+    set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    unset_locinfo ();
+    set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc wait
+    unset_locinfo ();
+    assert (state_init == 111);
+  }
+  assert (state == 112);
+
+  STATE_OP (state, = 190);
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_shutdown (acc_device_default);
+  unset_locinfo ();
+  assert (state == 190);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
new file mode 100644
index 000000000000..28cea048a1aa
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
@@ -0,0 +1,252 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the "DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT" reference in
+   libgomp.texi.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int num_gangs, num_workers, vector_length;
+
+
+void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 0);
+  STATE_OP (state, = 1);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    const char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  if (num_gangs < 1)
+    assert (event_info->launch_event.num_gangs >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_gangs == num_gangs);
+#else
+      /* No parallelized OpenACC kernels constructs, and unparallelized OpenACC
+	 kernels constructs must get launched as 1 x 1 x 1 kernels.  */
+      assert (event_info->launch_event.num_gangs == 1);
+#endif
+    }
+  if (num_workers < 1)
+    assert (event_info->launch_event.num_workers >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_workers == num_workers);
+#else
+      /* See num_gangs above.  */
+      assert (event_info->launch_event.num_workers == 1);
+#endif
+    }
+  if (vector_length < 1)
+    assert (event_info->launch_event.vector_length >= 1);
+  else if (acc_device_type == acc_device_nvidia) /* ... is special.  */
+    assert (event_info->launch_event.vector_length == 32);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.vector_length == vector_length);
+#else
+      /* See num_gangs above.  */
+      assert (event_info->launch_event.vector_length == 1);
+#endif
+    }
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  assert (state == 0);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  assert (state == 0);
+
+  /* Parallelism dimensions: compiler/runtime decides.  */
+  STATE_OP (state, = 0);
+  num_gangs = num_workers = vector_length = 0;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+#ifdef __OPTIMIZE__
+    /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+       "state == 0" still holds.  It's not yet clear what's going on.
+       Mis-optimization across the GOMP function call boundary?  Per its
+       gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+       "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+       must expect calls back into this compilation unit?  */
+    asm volatile ("" : : : "memory");
+#endif
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No acc_ev_enqueue_launch_start.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  /* Parallelism dimensions: literal.  */
+  STATE_OP (state, = 0);
+  num_gangs = 30;
+  num_workers = 3;
+  vector_length = 5;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  num_gangs (30) num_workers (3) vector_length (5)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+#ifdef __OPTIMIZE__
+    /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+       "state == 0" still holds.  It's not yet clear what's going on.
+       Mis-optimization across the GOMP function call boundary?  Per its
+       gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+       "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+       must expect calls back into this compilation unit?  */
+    asm volatile ("" : : : "memory");
+#endif
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No acc_ev_enqueue_launch_start.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  /* Parallelism dimensions: variable.  */
+  STATE_OP (state, = 0);
+  num_gangs = 22;
+  num_workers = 5;
+  vector_length = 7;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
+    {
+      for (int i = 0; i < N; ++i)
+	x[i] = i * i;
+    }
+#ifdef __OPTIMIZE__
+    /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+       "state == 0" still holds.  It's not yet clear what's going on.
+       Mis-optimization across the GOMP function call boundary?  Per its
+       gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+       "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+       must expect calls back into this compilation unit?  */
+    asm volatile ("" : : : "memory");
+#endif
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No acc_ev_enqueue_launch_start.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+	__builtin_abort ();
+#undef N
+  }
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1-debug_info.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1-debug_info.c
new file mode 100644
index 000000000000..19226449d932
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1-debug_info.c
@@ -0,0 +1,5 @@
+/* Test "acc_prof-parallel-1.c" with debug info available.  */
+
+/* { dg-additional-options "-g -DDEBUG_INFO=1" }  */
+
+#include "acc_prof-parallel-1.c"
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
new file mode 100644
index 000000000000..2b3cb3e16bc5
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -0,0 +1,737 @@
+/* Test dispatch of events to callbacks.  */
+
+/* If not included from "acc_prof-parallel-1-debug_info.c".  */
+#ifndef DEBUG_INFO
+# define DEBUG_INFO 0
+#endif
+
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+
+/* See the "DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT" reference in
+   libgomp.texi.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+
+static const char *acc_prof_src_file = NULL;
+static const char *acc_prof_func_name = NULL;
+static int acc_prof_line_no = -1;
+
+static void
+set_locinfo (const char *src_file, const char *func_name, int line_no)
+{
+  assert (acc_prof_src_file == NULL);
+  acc_prof_src_file = src_file;
+  assert (acc_prof_func_name == NULL);
+  acc_prof_func_name = func_name;
+  assert (acc_prof_line_no == -1);
+  acc_prof_line_no = line_no;
+}
+
+static void
+unset_locinfo ()
+{
+  assert (acc_prof_src_file != NULL);
+  acc_prof_src_file = NULL;
+  assert (acc_prof_func_name != NULL);
+  acc_prof_func_name = NULL;
+  assert (acc_prof_line_no != -1);
+  acc_prof_line_no = -1;
+}
+
+static void
+verify_locinfo (const acc_prof_info *prof_info)
+{
+  DEBUG_printf ("  acc_prof_src_file: '%s'\n", acc_prof_src_file ?: "NULL");
+  DEBUG_printf ("  prof_info->src_file: '%s'\n", prof_info->src_file ?: "NULL");
+  DEBUG_printf ("  acc_prof_func_name: '%s'\n", acc_prof_func_name ?: "NULL");
+  DEBUG_printf ("  prof_info->func_name: '%s'\n", prof_info->func_name ?: "NULL");
+  DEBUG_printf ("  acc_prof_line_no: '%d'\n", acc_prof_line_no);
+  DEBUG_printf ("  prof_info->line_no: '%d'\n", prof_info->line_no);
+
+  assert (acc_prof_src_file != NULL);
+  assert (acc_prof_func_name != NULL);
+  assert (acc_prof_line_no != -1);
+#if DEBUG_INFO
+  assert (prof_info->src_file != NULL);
+  assert (strcmp (prof_info->src_file, acc_prof_src_file) == 0);
+  assert (prof_info->func_name != NULL);
+  assert (strcmp (prof_info->func_name, acc_prof_func_name) == 0);
+  assert (prof_info->line_no == acc_prof_line_no);
+#else
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+#endif
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+}
+
+
+void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+#else
+  assert (state == 0
+	  || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+  //TODO verify_locinfo (prof_info);
+#else
+  verify_locinfo (prof_info);
+#endif
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+#else
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+#endif
+}
+
+void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 2
+	  || state == 102);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_device_init_start);
+#else
+  assert (state == 1
+	  || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start);
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+#if DEBUG_INFO
+  //TODO verify_locinfo (prof_info);
+#else
+  verify_locinfo (prof_info);
+#endif
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (event_info->other_event.tool_info == tool_info->nested);
+#else
+  assert (event_info->other_event.tool_info == tool_info);
+#endif
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+#else
+  free (tool_info);
+  tool_info = NULL;
+#endif
+}
+
+void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 3
+	  || state == 103);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_enter_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 4
+	  || state == 104);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_enter_data_start);
+
+  assert (prof_info->event_type == acc_ev_enter_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 7);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_exit_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 8);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == acc_ev_exit_data_start);
+
+  assert (prof_info->event_type == acc_ev_exit_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 0
+	  || state == 100);
+  if (state == 100)
+    {
+      /* Compensate for the missing acc_ev_device_init_start and
+	 acc_ev_device_init_end.  */
+      state += 2;
+    }
+#else
+  if (state == 100)
+    {
+      /* Compensate for the missing acc_ev_device_init_start and
+	 acc_ev_device_init_end.  */
+      state += 2;
+    }
+  assert (state == 2
+	  || state == 102);
+#endif
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing acc_ev_enter_data_start.  */
+      state += 1;
+    }
+}
+
+void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing acc_ev_enter_data_end.  */
+      state += 1;
+      /* Compensate for the missing acc_ev_enqueue_launch_start and
+	 acc_ev_enqueue_launch_end.  */
+      state += 2;
+      /* Compensate for the missing acc_ev_exit_data_start and
+	 acc_ev_exit_data_end.  */
+      state += 2;
+    }
+  else if (acc_async != acc_async_sync)
+    {
+      /* Compensate for the missing acc_ev_exit_data_start and
+	 acc_ev_exit_data_end.  */
+      state += 2;
+    }
+  assert (state == 9
+	  || state == 109);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 5
+	  || state == 105);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    const char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  assert (event_info->launch_event.num_gangs >= 1);
+  assert (event_info->launch_event.num_workers >= 1);
+  assert (event_info->launch_event.vector_length >= 1);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type;
+  tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name);
+  tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs;
+  tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers;
+  tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 6
+	  || state == 106);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start);
+  assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
+  assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
+  assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
+  assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  verify_locinfo (prof_info);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == tool_info->nested);
+  assert (event_info->launch_event.kernel_name != NULL);
+  assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0);
+  assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs);
+  assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers);
+  assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free ((void *) tool_info->nested->event_info.launch_event.kernel_name);
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg);
+  reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg);
+  reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg);
+  reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg);
+  assert (state == 0);
+
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_device_type = acc_get_device_type ();
+  unset_locinfo ();
+  set_locinfo (__FILE__, __func__, __LINE__ + 1);
+  acc_device_num = acc_get_device_num (acc_device_type);
+  unset_locinfo ();
+  acc_async = acc_async_sync;
+  assert (state == 0);
+
+  {
+    int state_init;
+    set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    unset_locinfo ();
+    assert (state_init == 4);
+  }
+#ifdef __OPTIMIZE__
+  /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+     "state == 0" still holds.  It's not yet clear what's going on.
+     Mis-optimization across the GOMP function call boundary?  Per its
+     gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+     "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+     must expect calls back into this compilation unit?  */
+  asm volatile ("" : : : "memory");
+#endif
+  assert (state == 10);
+
+  STATE_OP (state, = 100);
+
+  acc_async = 12;
+  {
+    int state_init;
+    set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    unset_locinfo ();
+    set_locinfo (__FILE__, __func__, __LINE__ + 1);
+#pragma acc wait
+    unset_locinfo ();
+    assert (state_init == 104);
+  }
+  assert (state == 110);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
new file mode 100644
index 000000000000..e2bf38e26799
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
@@ -0,0 +1,178 @@
+/* Test the "valid_bytes" magic.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
+  reg_ (acc_ev_create, cb_data_event, acc_reg);
+  reg_ (acc_ev_delete, cb_data_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_data_event, acc_reg);
+  reg_ (acc_ev_free, cb_data_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
+}
+
+
+/* Basic struct.  */
+typedef struct A
+{
+  int a;
+  int b;
+#define VALID_BYTES_A \
+  _ACC_PROF_VALID_BYTES_STRUCT (A, b, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (int))
+} A;
+
+/* Add a "char" field.  */
+typedef struct B
+{
+  int a;
+  int b;
+  char c;
+#define VALID_BYTES_B \
+  _ACC_PROF_VALID_BYTES_STRUCT (B, c, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} B;
+
+/* Add another "char" field.  */
+typedef struct C
+{
+  int a;
+  int b;
+  char c, d;
+#define VALID_BYTES_C \
+  _ACC_PROF_VALID_BYTES_STRUCT (C, d, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} C;
+
+/* Add two "void *" fields.  */
+typedef struct D
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+#define VALID_BYTES_D \
+  _ACC_PROF_VALID_BYTES_STRUCT (D, f, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} D;
+
+/* Add another three "char" fields.  */
+typedef struct E
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+  char g, h, i;
+#define VALID_BYTES_E \
+  _ACC_PROF_VALID_BYTES_STRUCT (E, i, \
+				_ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} E;
+
+
+int main()
+{
+  A A1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
+  assert (VALID_BYTES_A <= sizeof A1);
+  DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
+  assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
+
+  B B1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
+  assert (VALID_BYTES_B <= sizeof B1);
+  DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
+  assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
+
+  assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
+
+  C C1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
+  assert (VALID_BYTES_C <= sizeof C1);
+  DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
+  assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
+
+  assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
+
+  D D1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
+  assert (VALID_BYTES_D <= sizeof D1);
+  DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
+  assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
+
+  assert (VALID_BYTES_D > VALID_BYTES_C);
+
+  E E1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
+  assert (VALID_BYTES_E <= sizeof E1);
+  DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
+  assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
+
+  assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
+
+  /* Trigger tests done in cb_* functions.  */
+#pragma acc parallel
+  {
+  }
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
new file mode 100644
index 000000000000..7c619a09689f
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
@@ -0,0 +1,60 @@
+/* Test "acc_prof_info"'s  "version" field.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+
+void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->version == 201510);
+}
+
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
+  reg_ (acc_ev_create, cb_any_event, acc_reg);
+  reg_ (acc_ev_delete, cb_any_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_any_event, acc_reg);
+  reg_ (acc_ev_free, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
+}
+
+
+int main()
+{
+  /* Trigger tests done in cb_* functions.  */
+#pragma acc parallel
+  {
+  }
+
+  return 0;
+}


Grüße
 Thomas


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]