Next set of OpenACC changes: Testsuite

Thomas Schwinge thomas@codesourcery.com
Tue May 5 09:00:00 GMT 2015


Hi!

On Tue, 05 May 2015 10:54:02 +0200, I wrote:
> In follow-up messages, I'll be posting the separated parts (for easier
> review) of a next set of OpenACC changes that we'd like to commit.
> ChangeLog updates not yet written; will do that before commit, obviously.

 gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c  |   46 +
 .../c-c++-common/goacc-gomp/nesting-fail-1.c       |   25 -
 gcc/testsuite/c-c++-common/goacc/asyncwait-1.c     |    4 +-
 gcc/testsuite/c-c++-common/goacc/data-2.c          |   12 +-
 gcc/testsuite/c-c++-common/goacc/declare-1.c       |   84 +
 gcc/testsuite/c-c++-common/goacc/declare-2.c       |   67 +
 gcc/testsuite/c-c++-common/goacc/dtype-1.c         |  113 ++
 gcc/testsuite/c-c++-common/goacc/dtype-2.c         |   31 +
 gcc/testsuite/c-c++-common/goacc/host_data-1.c     |   14 +
 gcc/testsuite/c-c++-common/goacc/host_data-2.c     |   14 +
 gcc/testsuite/c-c++-common/goacc/host_data-3.c     |   16 +
 gcc/testsuite/c-c++-common/goacc/host_data-4.c     |   15 +
 gcc/testsuite/c-c++-common/goacc/kernels-1.c       |    6 -
 gcc/testsuite/c-c++-common/goacc/kernels-empty.c   |    6 +
 gcc/testsuite/c-c++-common/goacc/kernels-eternal.c |   11 +
 .../c-c++-common/goacc/kernels-noreturn.c          |   12 +
 gcc/testsuite/c-c++-common/goacc/loop-1.c          |    2 -
 gcc/testsuite/c-c++-common/goacc/parallel-1.c      |    6 -
 gcc/testsuite/c-c++-common/goacc/parallel-empty.c  |    6 +
 .../c-c++-common/goacc/parallel-eternal.c          |   11 +
 .../c-c++-common/goacc/parallel-noreturn.c         |   12 +
 gcc/testsuite/c-c++-common/goacc/reduction-1.c     |   25 +-
 gcc/testsuite/c-c++-common/goacc/reduction-2.c     |   22 +-
 gcc/testsuite/c-c++-common/goacc/reduction-3.c     |   22 +-
 gcc/testsuite/c-c++-common/goacc/reduction-4.c     |   40 +-
 gcc/testsuite/c-c++-common/goacc/routine-1.c       |   35 +
 gcc/testsuite/c-c++-common/goacc/routine-2.c       |   36 +
 gcc/testsuite/c-c++-common/goacc/routine-3.c       |   52 +
 gcc/testsuite/c-c++-common/goacc/routine-4.c       |   87 ++
 gcc/testsuite/c-c++-common/goacc/tile.c            |   26 +
 gcc/testsuite/g++.dg/goacc/template-reduction.C    |  100 ++
 gcc/testsuite/g++.dg/goacc/template.C              |  131 ++
 gcc/testsuite/gfortran.dg/goacc/cache-1.f95        |    1 -
 gcc/testsuite/gfortran.dg/goacc/coarray.f95        |    2 +-
 gcc/testsuite/gfortran.dg/goacc/coarray_2.f90      |    1 +
 gcc/testsuite/gfortran.dg/goacc/combined_loop.f90  |    2 +-
 gcc/testsuite/gfortran.dg/goacc/cray.f95           |    1 -
 gcc/testsuite/gfortran.dg/goacc/declare-1.f95      |    3 +-
 gcc/testsuite/gfortran.dg/goacc/declare-2.f95      |   44 +
 gcc/testsuite/gfortran.dg/goacc/default.f95        |   17 +
 gcc/testsuite/gfortran.dg/goacc/dtype-1.f95        |  161 ++
 gcc/testsuite/gfortran.dg/goacc/dtype-2.f95        |   39 +
 gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95 |    2 +-
 gcc/testsuite/gfortran.dg/goacc/loop-1.f95         |    1 -
 gcc/testsuite/gfortran.dg/goacc/loop-2.f95         |   26 +-
 gcc/testsuite/gfortran.dg/goacc/modules.f95        |   55 +
 gcc/testsuite/gfortran.dg/goacc/parameter.f95      |    1 -
 gcc/testsuite/gfortran.dg/goacc/update.f95         |    5 +
 libgomp/testsuite/
 .../libgomp.oacc-c++/template-reduction.C          |  102 ++
 .../libgomp.oacc-c-c++-common/atomic_capture-1.c   |  866 +++++++++++
 .../libgomp.oacc-c-c++-common/atomic_capture-2.c   | 1626 ++++++++++++++++++++
 .../libgomp.oacc-c-c++-common/atomic_update-1.c    |  760 +++++++++
 .../libgomp.oacc-c-c++-common/clauses-1.c          |   26 +
 .../testsuite/libgomp.oacc-c-c++-common/data-2.c   |   44 +-
 .../testsuite/libgomp.oacc-c-c++-common/data-3.c   |   18 +-
 .../libgomp.oacc-c-c++-common/data-clauses.h       |  202 +++
 .../libgomp.oacc-c-c++-common/kernels-1.c          |  182 +--
 .../testsuite/libgomp.oacc-c-c++-common/lib-69.c   |   70 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-70.c   |   79 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-71.c   |   55 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-72.c   |   60 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-73.c   |   64 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-74.c   |   91 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-75.c   |   89 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-76.c   |   88 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-77.c   |   91 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-78.c   |   91 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-79.c   |   91 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-80.c   |   95 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-81.c   |  106 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-82.c   |   43 +-
 .../testsuite/libgomp.oacc-c-c++-common/lib-83.c   |   22 +-
 .../libgomp.oacc-c-c++-common/parallel-1.c         |  204 +--
 .../libgomp.oacc-c-c++-common/routine-1.c          |   40 +
 .../libgomp.oacc-c-c++-common/routine-2.c          |   41 +
 libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h |   44 +-
 .../testsuite/libgomp.oacc-c-c++-common/subr.ptx   |  222 +--
 .../testsuite/libgomp.oacc-c-c++-common/timer.h    |  103 --
 .../libgomp.oacc-fortran/atomic_capture-1.f90      |  784 ++++++++++
 .../libgomp.oacc-fortran/atomic_update-1.f90       |  338 ++++
 libgomp/testsuite/libgomp.oacc-fortran/cache-1.f90 |   26 +
 .../testsuite/libgomp.oacc-fortran/clauses-1.f90   |  290 ++++
 libgomp/testsuite/libgomp.oacc-fortran/data-1.f90  |  231 ++-
 libgomp/testsuite/libgomp.oacc-fortran/data-2.f90  |   50 +
 libgomp/testsuite/libgomp.oacc-fortran/data-3.f90  |   34 +-
 .../testsuite/libgomp.oacc-fortran/data-4-2.f90    |   19 +-
 libgomp/testsuite/libgomp.oacc-fortran/data-4.f90  |   19 +-
 .../testsuite/libgomp.oacc-fortran/declare-1.f90   |  229 +++
 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90  |   24 +
 libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90  |   28 +
 libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90  |   79 +
 libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90  |   52 +
 .../testsuite/libgomp.oacc-fortran/routine-5.f90   |   27 +

diff --git gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c
index df45bcf..b38e181 100644
--- gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c
+++ gcc/testsuite/c-c++-common/goacc-gomp/nesting-1.c
@@ -1,4 +1,50 @@
 void
+f_acc_data (void)
+{
+#pragma acc data
+  {
+    int i;
+#pragma omp atomic write
+    i = 0;
+  }
+}
+
+void
+f_acc_kernels (void)
+{
+#pragma acc kernels
+  {
+    int i;
+#pragma omp atomic write
+    i = 0;
+  }
+}
+
+void
+f_acc_loop (void)
+{
+  int i;
+
+#pragma acc loop
+  for (i = 0; i < 2; ++i)
+    {
+#pragma omp atomic write
+      i = 0;
+    }
+}
+
+void
+f_acc_parallel (void)
+{
+#pragma acc parallel
+  {
+    int i;
+#pragma omp atomic write
+    i = 0;
+  }
+}
+
+void
 f_omp_parallel (void)
 {
 #pragma omp parallel
diff --git gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c
index 411fb5f..14c6aa6 100644
--- gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c
+++ gcc/testsuite/c-c++-common/goacc-gomp/nesting-fail-1.c
@@ -216,12 +216,6 @@ f_acc_parallel (void)
 
 #pragma acc parallel
   {
-#pragma omp atomic write
-    i = 0; /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
-  }
-
-#pragma acc parallel
-  {
 #pragma omp ordered /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
     ;
   }
@@ -286,12 +280,6 @@ f_acc_kernels (void)
 
 #pragma acc kernels
   {
-#pragma omp atomic write
-    i = 0; /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
-  }
-
-#pragma acc kernels
-  {
 #pragma omp ordered /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
     ;
   }
@@ -356,12 +344,6 @@ f_acc_data (void)
 
 #pragma acc data
   {
-#pragma omp atomic write
-    i = 0; /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
-  }
-
-#pragma acc data
-  {
 #pragma omp ordered /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
     ;
   }
@@ -434,13 +416,6 @@ f_acc_loop (void)
 #pragma acc loop
   for (i = 0; i < 2; ++i)
     {
-#pragma omp atomic write
-      i = 0; /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
-    }
-
-#pragma acc loop
-  for (i = 0; i < 2; ++i)
-    {
 #pragma omp ordered /* { dg-error "non-OpenACC construct inside of OpenACC region" } */
       ;
     }
diff --git gcc/testsuite/c-c++-common/goacc/asyncwait-1.c gcc/testsuite/c-c++-common/goacc/asyncwait-1.c
index ccc0106..c6b81b1 100644
--- gcc/testsuite/c-c++-common/goacc/asyncwait-1.c
+++ gcc/testsuite/c-c++-common/goacc/asyncwait-1.c
@@ -116,7 +116,7 @@ f (int N, float *a, float *b)
     }
 
 #pragma acc parallel copyin (a[0:N]) copy (b[0:N]) wait (1 /* { dg-error "expected '\\\)' before end of line" } */
-    /* { dg-error "expected integer expression before '\\\)'" "" { target c++ } 118 } */
+    /* { dg-error "expected integer expression list before" "" { target c++ } 118 } */
     {
         for (ii = 0; ii < N; ii++)
             b[ii] = a[ii];
@@ -171,7 +171,7 @@ f (int N, float *a, float *b)
 #pragma acc wait (1,2,,) /* { dg-error "expected (primary-|)expression before" } */
 
 #pragma acc wait (1 /* { dg-error "expected '\\\)' before end of line" } */
-    /* { dg-error "expected integer expression before '\\\)'" "" { target c++ } 173 } */
+    /* { dg-error "expected integer expression list before" "" { target c++ } 173 } */
 
 #pragma acc wait (1,*) /* { dg-error "expected (primary-|)expression before" } */
 
diff --git gcc/testsuite/c-c++-common/goacc/data-2.c gcc/testsuite/c-c++-common/goacc/data-2.c
index a67d8a4..1043bf8a 100644
--- gcc/testsuite/c-c++-common/goacc/data-2.c
+++ gcc/testsuite/c-c++-common/goacc/data-2.c
@@ -10,12 +10,14 @@ foo (void)
 #pragma acc exit data delete (a) if (0)
 #pragma acc exit data copyout (b) if (a)
 #pragma acc exit data delete (b)
-#pragma acc enter /* { dg-error "expected 'data' in" } */
-#pragma acc exit /* { dg-error "expected 'data' in" } */
+#pragma acc enter /* { dg-error "expected 'data' after" } */
+#pragma acc exit /* { dg-error "expected 'data' after" } */
 #pragma acc enter data /* { dg-error "has no data movement clause" } */
-#pragma acc exit data /* { dg-error "has no data movement clause" } */
-#pragma acc enter Data /* { dg-error "invalid pragma before" } */
-#pragma acc exit copyout (b) /* { dg-error "invalid pragma before" } */
+#pragma acc exit data /* { dg-error "no data movement clause" } */
+#pragma acc enter Data /* { dg-error "expected 'data' after" } */
+#pragma acc exit copyout (b) /* { dg-error "expected 'data' after" } */
+#pragma acc enter for /* { dg-error "expected 'data' after" } */
+#pragma acc enter data2 /* { dg-error "expected 'data' after" } */
 }
 
 /* { dg-error "has no data movement clause" "" { target *-*-* } 8 } */
diff --git gcc/testsuite/c-c++-common/goacc/declare-1.c gcc/testsuite/c-c++-common/goacc/declare-1.c
new file mode 100644
index 0000000..cf50f02
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/declare-1.c
@@ -0,0 +1,84 @@
+/* Test valid uses of declare directive.  */
+/* { dg-do compile } */
+/* { dg-skip-if "not yet" { c++ } } */
+
+int v0;
+#pragma acc declare create(v0)
+
+int v1;
+#pragma acc declare copyin(v1)
+
+int *v2;
+#pragma acc declare deviceptr(v2)
+
+int v3;
+#pragma acc declare device_resident(v3)
+
+int v4;
+#pragma acc declare link(v4)
+
+int v5, v6, v7, v8;
+#pragma acc declare create(v5, v6) copyin(v7, v8)
+
+void
+f (void)
+{
+  int va0;
+#pragma acc declare create(va0)
+
+  int va1;
+#pragma acc declare copyin(va1)
+
+  int *va2;
+#pragma acc declare deviceptr(va2)
+
+  int va3;
+#pragma acc declare device_resident(va3)
+
+  extern int ve0;
+#pragma acc declare create(ve0)
+
+  extern int ve1;
+#pragma acc declare copyin(ve1)
+
+  extern int *ve2;
+#pragma acc declare deviceptr(ve2)
+
+  extern int ve3;
+#pragma acc declare device_resident(ve3)
+
+  extern int ve4;
+#pragma acc declare link(ve4)
+
+  int va5;
+#pragma acc declare copy(va5)
+
+  int va6;
+#pragma acc declare copyout(va6)
+
+  int va7;
+#pragma acc declare present(va7)
+
+  int va8;
+#pragma acc declare present_or_copy(va8)
+
+  int va9;
+#pragma acc declare present_or_copyin(va9)
+
+  int va10;
+#pragma acc declare present_or_copyout(va10)
+
+  int va11;
+#pragma acc declare present_or_create(va11)
+
+ a:
+  {
+    int va0;
+#pragma acc declare create(va0)
+    if (v1)
+      goto a;
+    else
+      goto b;
+  }
+ b:;
+}
diff --git gcc/testsuite/c-c++-common/goacc/declare-2.c gcc/testsuite/c-c++-common/goacc/declare-2.c
new file mode 100644
index 0000000..a2b5d6f
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/declare-2.c
@@ -0,0 +1,67 @@
+/* Test invalid uses of declare directive.  */
+/* { dg-do compile } */
+/* { dg-skip-if "not yet" { c++ } } */
+
+#pragma acc declare /* { dg-error "no valid clauses" } */
+
+#pragma acc declare create(undeclared) /* { dg-error "undeclared" } */
+/* { dg-error "no valid clauses" "second error" { target *-*-* } 7 } */
+
+int v0[10];
+#pragma acc declare create(v0[1:3]) /* { dg-error "subarray" } */
+
+int v1;
+#pragma acc declare create(v1, v1) /* { dg-error "more than once" } */
+
+int v2;
+#pragma acc declare create(v2) /* { dg-message "previous directive" } */
+#pragma acc declare copyin(v2) /* { dg-error "more than once" } */
+
+int v3;
+#pragma acc declare copy(v3) /* { dg-error "at file scope" } */
+
+int v4;
+#pragma acc declare copyout(v4) /* { dg-error "at file scope" } */
+
+int v5;
+#pragma acc declare present(v5) /* { dg-error "at file scope" } */
+
+int v6;
+#pragma acc declare present_or_copy(v6) /* { dg-error "at file scope" } */
+
+int v7;
+#pragma acc declare present_or_copyin(v7) /* { dg-error "at file scope" } */
+
+int v8;
+#pragma acc declare present_or_copyout(v8) /* { dg-error "at file scope" } */
+
+int v9;
+#pragma acc declare present_or_create(v9) /* { dg-error "at file scope" } */
+
+void
+f (void)
+{
+  int va0;
+#pragma acc declare link(va0) /* { dg-error "invalid variable" } */
+
+  extern int ve0;
+#pragma acc declare copy(ve0) /* { dg-error "invalid use of" } */
+
+  extern int ve1;
+#pragma acc declare copyout(ve1) /* { dg-error "invalid use of" } */
+
+  extern int ve2;
+#pragma acc declare present(ve2) /* { dg-error "invalid use of" } */
+
+  extern int ve3;
+#pragma acc declare present_or_copy(ve3) /* { dg-error "invalid use of" } */
+
+  extern int ve4;
+#pragma acc declare present_or_copyin(ve4) /* { dg-error "invalid use of" } */
+
+  extern int ve5;
+#pragma acc declare present_or_copyout(ve5) /* { dg-error "invalid use of" } */
+
+  extern int ve6;
+#pragma acc declare present_or_create(ve6) /* { dg-error "invalid use of" } */
+}
diff --git gcc/testsuite/c-c++-common/goacc/dtype-1.c gcc/testsuite/c-c++-common/goacc/dtype-1.c
new file mode 100644
index 0000000..2b4569e
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/dtype-1.c
@@ -0,0 +1,113 @@
+/* { dg-do compile } */
+/* { dg-options "-fopenacc -fdump-tree-omplower" } */
+
+void
+test ()
+{
+  int i1;
+
+  /* ACC PARALLEL DEVICE_TYPE: */
+
+#pragma acc parallel device_type (nVidia) async (1) num_gangs (100) num_workers (100) vector_length (32) wait (1)
+  {
+  }
+
+#pragma acc parallel async (1) num_gangs (1) num_workers (1) vector_length (1) wait (1) dtype (nvidia) async (2) num_gangs (200) num_workers (200) vector_length (64) wait (2)
+  {
+  }
+
+#pragma acc parallel async (1) num_gangs (1) num_workers (1) vector_length (1) wait (1) dtype (nvidia) async (3) num_gangs (300) num_workers (300) vector_length (128) wait (3) device_type (*) async (10) num_gangs (10) num_workers (10) vector_length (10) wait (10)
+  {
+  }
+
+#pragma acc parallel async (1) num_gangs (1) num_workers (1) vector_length (1) wait (1) device_type (nvidia_ptx) async (3) num_gangs (300) num_workers (300) vector_length (128) wait (3) dtype (*) async (10) num_gangs (10) num_workers (10) vector_length (10) wait (10)
+  {
+  }
+
+  /* ACC KERNELS DEVICE_TYPE: */
+
+#pragma acc kernels device_type (nvidia) async wait
+  {
+  }
+
+#pragma acc kernels async wait dtype (nvidia) async (1) wait (1)
+  {
+  }
+
+#pragma acc kernels async wait dtype (nvidia) async (2) wait (2) device_type (*) async (0) wait (0)
+  {
+  }
+
+#pragma acc kernels async wait device_type (nvidia_ptx) async (1) wait (1) dtype (*) async (0) wait (0)
+  {
+  }
+
+  /* ACC LOOP DEVICE_TYPE: */
+
+#pragma acc parallel
+#pragma acc loop dtype (nVidia) gang
+  for (i1 = 1; i1 < 10; i1++)
+    {
+    }
+
+#pragma acc parallel
+#pragma acc loop device_type (nVidia) gang dtype (*) worker
+  for (i1 = 1; i1 < 10; i1++)
+    {
+    }
+
+#pragma acc parallel
+#pragma acc loop dtype (nVidiaGPU) gang device_type (*) vector
+  for (i1 = 1; i1 < 10; i1++)
+    {
+    }
+
+  /* ACC UPDATE DEVICE_TYPE: */
+
+#pragma acc update host(i1) async(1) wait (1)
+
+#pragma acc update host(i1) device_type(nvidia) async(2) wait (2)
+
+#pragma acc update host(i1) async(1) wait (1) device_type(nvidia) async(3) wait (3)
+
+#pragma acc update host(i1) async(4) wait (4) device_type(nvidia) async(5) wait (5) dtype (*) async (6) wait (6)
+
+#pragma acc update host(i1) async(4) wait (4) dtype(nvidia1) async(5) wait (5) dtype (*) async (6) wait (6)
+}
+
+/* ACC ROUTINE DEVICE_TYPE: */
+
+#pragma acc routine (foo1) device_type (nvidia) gang
+#pragma acc routine (foo2) device_type (nvidia) worker
+#pragma acc routine (foo3) dtype (nvidia) vector
+#pragma acc routine (foo5) device_type (nvidia) bind (foo)
+#pragma acc routine (foo6) device_type (nvidia) gang device_type (*) worker
+#pragma acc routine (foo7) dtype (nvidia) worker dtype (*) vector
+#pragma acc routine (foo8) dtype (nvidia) vector device_type (*) gang
+#pragma acc routine (foo9) device_type (nvidia) vector device_type (*) worker
+#pragma acc routine (foo10) device_type (nvidia) bind (foo) dtype (*) gang
+#pragma acc routine (foo11) device_type (gpu) gang device_type (*) worker
+#pragma acc routine (foo12) device_type (gpu) worker dtype (*) worker
+#pragma acc routine (foo13) device_type (gpu) vector device_type (*) worker
+#pragma acc routine (foo14) dtype (gpu) worker dtype (*) worker
+#pragma acc routine (foo15) dtype (gpu) bind (foo) dtype (*) gang
+
+/* { dg-final { scan-tree-dump-times "oacc_parallel wait\\(1\\) vector_length\\(32\\) num_workers\\(100\\) num_gangs\\(100\\) async\\(1\\)" 1 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "oacc_parallel wait\\(1\\) vector_length\\(1\\) num_workers\\(1\\) num_gangs\\(1\\) async\\(1\\) wait\\(2\\) vector_length\\(64\\) num_workers\\(200\\) num_gangs\\(200\\) async\\(2\\)" 1 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "oacc_parallel wait\\(1\\) vector_length\\(1\\) num_workers\\(1\\) num_gangs\\(1\\) async\\(1\\) wait\\(3\\) vector_length\\(128\\) num_workers\\(300\\) num_gangs\\(300\\) async\\(3\\)" 1 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 4 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\) wait\\(2\\) async\\(2\\)" 1 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\) wait\\(0\\) async\\(0\\)" 1 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "acc loop gang private\\(i1.0\\) private\\(i1\\)" 1 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "acc loop gang private\\(i1.1\\) private\\(i1\\)" 1 "omplower" } } */
+
+/* { dg-final { scan-tree-dump-times "acc loop vector private\\(i1.2\\) private\\(i1\\)" 1 "omplower" } } */
+
+/* { dg-final { cleanup-tree-dump "omplower" } } */
diff --git gcc/testsuite/c-c++-common/goacc/dtype-2.c gcc/testsuite/c-c++-common/goacc/dtype-2.c
new file mode 100644
index 0000000..b0bd247
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/dtype-2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+
+void
+test ()
+{
+  int i1, i2;
+
+  /* ACC PARALLEL DEVICE_TYPE: */
+
+#pragma acc parallel dtype (nVidia) async (1) num_gangs (100) num_workers (100) vector_length (32) wait (1) copy (i1) /* { dg-error "not valid" } */
+  {
+  }
+
+  /* ACC KERNELS DEVICE_TYPE: */
+
+#pragma acc kernels device_type (nvidia) async wait copy (i1) /* { dg-error "not valid" } */
+  {
+  }
+
+  /* ACC LOOP DEVICE_TYPE: */
+
+#pragma acc parallel
+#pragma acc loop device_type (nVidia) gang private (i2) /* { dg-error "not valid" } */
+  for (i1 = 1; i1 < 10; i1++)
+    {
+    }
+
+  /* ACC UPDATE DEVICE_TYPE: */
+
+#pragma acc update host(i1) dtype (nvidia) async(1) wait (1) self (i2) /* { dg-error "not valid" } */
+}
diff --git gcc/testsuite/c-c++-common/goacc/host_data-1.c gcc/testsuite/c-c++-common/goacc/host_data-1.c
new file mode 100644
index 0000000..5e8240f
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/host_data-1.c
@@ -0,0 +1,14 @@
+/* Test valid use of host_data directive.  */
+/* { dg-do compile } */
+
+int v0;
+int v1[3][3];
+
+void
+f (void)
+{
+  int v2 = 3;
+#pragma acc host_data use_device(v2, v0, v1)
+  ;
+}
+/* { dg-bogus "sorry, unimplemented: directive not yet implemented" "host_data" { xfail *-*-* } 11 } */
diff --git gcc/testsuite/c-c++-common/goacc/host_data-2.c gcc/testsuite/c-c++-common/goacc/host_data-2.c
new file mode 100644
index 0000000..92fa97b
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/host_data-2.c
@@ -0,0 +1,14 @@
+/* Test invalid use of host_data directive.  */
+/* { dg-do compile } */
+
+int v0;
+#pragma acc host_data use_device(v0) /* { dg-error "expected" } */
+
+void
+f (void)
+{
+  int v2 = 3;
+#pragma acc host_data copy(v2) /* { dg-error "not valid for" } */
+  ;
+}
+/* { dg-bogus "sorry, unimplemented: directive not yet implemented" "host_data" { xfail *-*-* } 11 } */
diff --git gcc/testsuite/c-c++-common/goacc/host_data-3.c gcc/testsuite/c-c++-common/goacc/host_data-3.c
new file mode 100644
index 0000000..580f566
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/host_data-3.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+
+int main (int argc, char* argv[])
+{
+  int x = 5, y;
+
+  #pragma acc enter data copyin (x)
+  #pragma acc host_data use_device (x)
+  {
+    y = x;
+  }
+  #pragma acc exit data delete (x)
+
+  return y - 5;
+}
+/* { dg-bogus "sorry, unimplemented: directive not yet implemented" "host_data" { xfail *-*-* } 8 } */
diff --git gcc/testsuite/c-c++-common/goacc/host_data-4.c gcc/testsuite/c-c++-common/goacc/host_data-4.c
new file mode 100644
index 0000000..61b1c5b
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/host_data-4.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+int main (int argc, char* argv[])
+{
+  int x[100];
+
+  #pragma acc enter data copyin (x)
+  /* Specifying an array index is not valid for host_data/use_device.  */
+  #pragma acc host_data use_device (x[4]) /* { dg-error "expected \\\')' before '\\\[' token" } */
+    ;
+  #pragma acc exit data delete (x)
+
+  return 0;
+}
+/* { dg-bogus "sorry, unimplemented: directive not yet implemented" "host_data" { xfail *-*-* } 9 } */
diff --git gcc/testsuite/c-c++-common/goacc/kernels-1.c gcc/testsuite/c-c++-common/goacc/kernels-1.c
deleted file mode 100644
index e91b81c..0000000
--- gcc/testsuite/c-c++-common/goacc/kernels-1.c
+++ /dev/null
@@ -1,6 +0,0 @@
-void
-foo (void)
-{
-#pragma acc kernels
-  ;
-}
diff --git gcc/testsuite/c-c++-common/goacc/kernels-empty.c gcc/testsuite/c-c++-common/goacc/kernels-empty.c
new file mode 100644
index 0000000..e91b81c
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-empty.c
@@ -0,0 +1,6 @@
+void
+foo (void)
+{
+#pragma acc kernels
+  ;
+}
diff --git gcc/testsuite/c-c++-common/goacc/kernels-eternal.c gcc/testsuite/c-c++-common/goacc/kernels-eternal.c
new file mode 100644
index 0000000..edc17d2
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-eternal.c
@@ -0,0 +1,11 @@
+int
+main (void)
+{
+#pragma acc kernels
+  {
+    while (1)
+      ;
+  }
+
+  return 0;
+}
diff --git gcc/testsuite/c-c++-common/goacc/kernels-noreturn.c gcc/testsuite/c-c++-common/goacc/kernels-noreturn.c
new file mode 100644
index 0000000..1a8cc67
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/kernels-noreturn.c
@@ -0,0 +1,12 @@
+int
+main (void)
+{
+
+#pragma acc kernels
+  {
+    __builtin_abort ();
+  }
+
+  return 0;
+}
+
diff --git gcc/testsuite/c-c++-common/goacc/loop-1.c gcc/testsuite/c-c++-common/goacc/loop-1.c
index fea40e0..5e1a248 100644
--- gcc/testsuite/c-c++-common/goacc/loop-1.c
+++ gcc/testsuite/c-c++-common/goacc/loop-1.c
@@ -1,5 +1,3 @@
-/* { dg-skip-if "not yet" { c++ } } */
-
 int test1()
 {
   int i, j, k, b[10];
diff --git gcc/testsuite/c-c++-common/goacc/parallel-1.c gcc/testsuite/c-c++-common/goacc/parallel-1.c
deleted file mode 100644
index a860526..0000000
--- gcc/testsuite/c-c++-common/goacc/parallel-1.c
+++ /dev/null
@@ -1,6 +0,0 @@
-void
-foo (void)
-{
-#pragma acc parallel
-  ;
-}
diff --git gcc/testsuite/c-c++-common/goacc/parallel-empty.c gcc/testsuite/c-c++-common/goacc/parallel-empty.c
new file mode 100644
index 0000000..a860526
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/parallel-empty.c
@@ -0,0 +1,6 @@
+void
+foo (void)
+{
+#pragma acc parallel
+  ;
+}
diff --git gcc/testsuite/c-c++-common/goacc/parallel-eternal.c gcc/testsuite/c-c++-common/goacc/parallel-eternal.c
new file mode 100644
index 0000000..51eac76
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/parallel-eternal.c
@@ -0,0 +1,11 @@
+int
+main (void)
+{
+#pragma acc parallel
+  {
+    while (1)
+      ;
+  }
+
+  return 0;
+}
diff --git gcc/testsuite/c-c++-common/goacc/parallel-noreturn.c gcc/testsuite/c-c++-common/goacc/parallel-noreturn.c
new file mode 100644
index 0000000..ec840bd
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/parallel-noreturn.c
@@ -0,0 +1,12 @@
+int
+main (void)
+{
+
+#pragma acc parallel
+  {
+    __builtin_abort ();
+  }
+
+  return 0;
+}
+
diff --git gcc/testsuite/c-c++-common/goacc/reduction-1.c gcc/testsuite/c-c++-common/goacc/reduction-1.c
index 0f50082..8f7c70d 100644
--- gcc/testsuite/c-c++-common/goacc/reduction-1.c
+++ gcc/testsuite/c-c++-common/goacc/reduction-1.c
@@ -22,20 +22,17 @@ main(void)
   for (i = 0; i < n; i++)
     result *= array[i];
 
-//   result = 0;
-//   vresult = 0;
-// 
-//   /* 'max' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result > array[i] ? result : array[i];
-//
-//   /* 'min' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result < array[i] ? result : array[i];
+  /* 'max' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (max:result)
+  for (i = 0; i < n; i++)
+    result = result > array[i] ? result : array[i];
+
+  /* 'min' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (min:result)
+  for (i = 0; i < n; i++)
+    result = result < array[i] ? result : array[i];
 
   /* '&' reductions.  */
 #pragma acc parallel vector_length (vl)
diff --git gcc/testsuite/c-c++-common/goacc/reduction-2.c gcc/testsuite/c-c++-common/goacc/reduction-2.c
index 1f95138..7ff125f 100644
--- gcc/testsuite/c-c++-common/goacc/reduction-2.c
+++ gcc/testsuite/c-c++-common/goacc/reduction-2.c
@@ -22,17 +22,17 @@ main(void)
   for (i = 0; i < n; i++)
     result *= array[i];
 
-//   /* 'max' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result > array[i] ? result : array[i];
-// 
-//   /* 'min' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result < array[i] ? result : array[i];
+  /* 'max' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (max:result)
+  for (i = 0; i < n; i++)
+    result = result > array[i] ? result : array[i];
+
+  /* 'min' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (min:result)
+  for (i = 0; i < n; i++)
+    result = result < array[i] ? result : array[i];
 
   /* '&&' reductions.  */
 #pragma acc parallel vector_length (vl)
diff --git gcc/testsuite/c-c++-common/goacc/reduction-3.c gcc/testsuite/c-c++-common/goacc/reduction-3.c
index 476e375..cd44559 100644
--- gcc/testsuite/c-c++-common/goacc/reduction-3.c
+++ gcc/testsuite/c-c++-common/goacc/reduction-3.c
@@ -22,17 +22,17 @@ main(void)
   for (i = 0; i < n; i++)
     result *= array[i];
 
-//   /* 'max' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result > array[i] ? result : array[i];
-// 
-//   /* 'min' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result < array[i] ? result : array[i];
+  /* 'max' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (max:result)
+  for (i = 0; i < n; i++)
+    result = result > array[i] ? result : array[i];
+
+  /* 'min' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (min:result)
+  for (i = 0; i < n; i++)
+    result = result < array[i] ? result : array[i];
 
   /* '&&' reductions.  */
 #pragma acc parallel vector_length (vl)
diff --git gcc/testsuite/c-c++-common/goacc/reduction-4.c gcc/testsuite/c-c++-common/goacc/reduction-4.c
index 73dde86..ec3a9c9 100644
--- gcc/testsuite/c-c++-common/goacc/reduction-4.c
+++ gcc/testsuite/c-c++-common/goacc/reduction-4.c
@@ -16,25 +16,29 @@ main(void)
   for (i = 0; i < n; i++)
     result += array[i];
 
-  /* Needs support for complex multiplication.  */
+  /* '*' reductions.  */
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (*:result)
+  for (i = 0; i < n; i++)
+    result *= array[i];
 
-//   /* '*' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (*:result)
-//   for (i = 0; i < n; i++)
-//     result *= array[i];
-//
-//   /* 'max' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result > array[i] ? result : array[i];
-// 
-//   /* 'min' reductions.  */
-// #pragma acc parallel vector_length (vl)
-// #pragma acc loop reduction (+:result)
-//   for (i = 0; i < n; i++)
-//       result = result < array[i] ? result : array[i];
+  /* 'max' reductions.  */
+#if 0
+  // error: 'result' has invalid type for 'reduction(max)'
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (max:result)
+  for (i = 0; i < n; i++)
+    result = result > array[i] ? result : array[i];
+#endif
+
+  /* 'min' reductions.  */
+#if 0
+  // error: 'result' has invalid type for 'reduction(min)'
+#pragma acc parallel vector_length (vl)
+#pragma acc loop reduction (min:result)
+  for (i = 0; i < n; i++)
+    result = result < array[i] ? result : array[i];
+#endif
 
   /* '&&' reductions.  */
 #pragma acc parallel vector_length (vl)
diff --git gcc/testsuite/c-c++-common/goacc/routine-1.c gcc/testsuite/c-c++-common/goacc/routine-1.c
new file mode 100644
index 0000000..1f89fdb
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/routine-1.c
@@ -0,0 +1,35 @@
+void *malloc (__SIZE_TYPE__);
+void free (void *);
+
+#pragma acc routine
+int
+fact (int n)
+{
+  if (n == 0 || n == 1)
+    return 1;
+
+  return n * fact (n - 1);
+}
+
+int
+main(int argc, char **argv)
+{
+  int *a, i, n = 10;
+
+  a = (int *)malloc (sizeof (int) * n);
+
+#pragma acc parallel copy (a[0:n]) vector_length (5)
+  {
+#pragma acc loop
+    for (i = 0; i < n; i++)
+      a[i] = fact (i);
+  }
+
+  for (i = 0; i < n; i++)
+    if (fact (i) != a[i])
+      return -1;
+
+  free (a);
+
+  return 0;
+}
diff --git gcc/testsuite/c-c++-common/goacc/routine-2.c gcc/testsuite/c-c++-common/goacc/routine-2.c
new file mode 100644
index 0000000..fe2e7f7
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/routine-2.c
@@ -0,0 +1,36 @@
+void *malloc (__SIZE_TYPE__);
+void free (void *);
+
+#pragma acc routine (fact)
+
+int
+fact (int n)
+{
+  if (n == 0 || n == 1)
+    return 1;
+
+  return n * fact (n - 1);
+}
+
+int
+main(int argc, char **argv)
+{
+  int *a, i, n = 10;
+
+  a = (int *)malloc (sizeof (int) * n);
+
+#pragma acc parallel copy (a[0:n]) vector_length (5)
+  {
+#pragma acc loop
+    for (i = 0; i < n; i++)
+      a[i] = fact (i);
+  }
+
+  for (i = 0; i < n; i++)
+    if (fact (i) != a[i])
+      return -1;
+
+  free (a);
+
+  return 0;
+}
diff --git gcc/testsuite/c-c++-common/goacc/routine-3.c gcc/testsuite/c-c++-common/goacc/routine-3.c
new file mode 100644
index 0000000..e35dfc1
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/routine-3.c
@@ -0,0 +1,52 @@
+/* Test valid use of clauses with routine.  */
+/* { dg-do compile } */
+
+#pragma acc routine gang
+void
+f1 (void)
+{
+}
+
+#pragma acc routine worker
+void
+f2 (void)
+{
+}
+
+#pragma acc routine vector
+void
+f3 (void)
+{
+}
+
+#pragma acc routine seq
+void
+f4 (void)
+{
+}
+
+#pragma acc routine bind (f4a)
+void
+f5 (void)
+{
+}
+
+typedef int T;
+
+#pragma acc routine bind (T)
+void
+f6 (void)
+{
+}
+
+#pragma acc routine bind ("f7a")
+void
+f7 (void)
+{
+}
+
+#pragma acc routine nohost
+void
+f8 (void)
+{
+}
diff --git gcc/testsuite/c-c++-common/goacc/routine-4.c gcc/testsuite/c-c++-common/goacc/routine-4.c
new file mode 100644
index 0000000..682d901
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/routine-4.c
@@ -0,0 +1,87 @@
+/* Test invalid use of clauses with routine.  */
+/* { dg-do compile } */
+
+#pragma acc routine gang worker /* { dg-error "invalid combination" } */
+void
+f1 (void)
+{
+}
+
+#pragma acc routine worker gang /* { dg-error "invalid combination" } */
+void
+f1a (void)
+{
+}
+
+#pragma acc routine gang vector /* { dg-error "invalid combination" } */
+void
+f2 (void)
+{
+}
+
+#pragma acc routine vector gang /* { dg-error "invalid combination" } */
+void
+f2a (void)
+{
+}
+
+#pragma acc routine gang seq /* { dg-error "invalid combination" } */
+void
+f3 (void)
+{
+}
+
+#pragma acc routine seq gang /* { dg-error "invalid combination" } */
+void
+f3a (void)
+{
+}
+
+#pragma acc routine worker vector /* { dg-error "invalid combination" } */
+void
+f4 (void)
+{
+}
+
+#pragma acc routine vector worker /* { dg-error "invalid combination" } */
+void
+f4a (void)
+{
+}
+
+#pragma acc routine worker seq /* { dg-error "invalid combination" } */
+void
+f5 (void)
+{
+}
+
+#pragma acc routine seq worker /* { dg-error "invalid combination" } */
+void
+f5a (void)
+{
+}
+
+#pragma acc routine vector seq /* { dg-error "invalid combination" } */
+void
+f6 (void)
+{
+}
+
+#pragma acc routine seq vector /* { dg-error "invalid combination" } */
+void
+f6a (void)
+{
+}
+
+#pragma acc routine (g1) gang worker /* { dg-error "invalid combination" } */
+#pragma acc routine (g2) worker gang /* { dg-error "invalid combination" } */
+#pragma acc routine (g3) gang vector /* { dg-error "invalid combination" } */
+#pragma acc routine (g4) vector gang /* { dg-error "invalid combination" } */
+#pragma acc routine (g5) gang seq /* { dg-error "invalid combination" } */
+#pragma acc routine (g6) seq gang /* { dg-error "invalid combination" } */
+#pragma acc routine (g7) worker vector /* { dg-error "invalid combination" } */
+#pragma acc routine (g8) vector worker /* { dg-error "invalid combination" } */
+#pragma acc routine (g9) worker seq /* { dg-error "invalid combination" } */
+#pragma acc routine (g10) seq worker /* { dg-error "invalid combination" } */
+#pragma acc routine (g11) vector seq /* { dg-error "invalid combination" } */
+#pragma acc routine (g12) seq vector /* { dg-error "invalid combination" } */
diff --git gcc/testsuite/c-c++-common/goacc/tile.c gcc/testsuite/c-c++-common/goacc/tile.c
new file mode 100644
index 0000000..e127955
--- /dev/null
+++ gcc/testsuite/c-c++-common/goacc/tile.c
@@ -0,0 +1,26 @@
+int
+main ()
+{
+  int i;
+
+#pragma acc parallel loop tile (10)
+  for (i = 0; i < 100; i++)
+    ;
+
+#pragma acc parallel loop tile (*)
+  for (i = 0; i < 100; i++)
+    ;
+
+#pragma acc parallel loop tile (10, *)
+  for (i = 0; i < 100; i++)
+    ;
+
+#pragma acc parallel loop tile (10, *, i) /* { dg-error "positive constant integer expression" } */
+  for (i = 0; i < 100; i++)
+    ;
+
+  return 0;
+}
+/* { dg-bogus "sorry, unimplemented: Clause not supported yet" "tile" { xfail *-*-* } 6 } */
+/* { dg-bogus "sorry, unimplemented: Clause not supported yet" "tile" { xfail *-*-* } 10 } */
+/* { dg-bogus "sorry, unimplemented: Clause not supported yet" "tile" { xfail *-*-* } 14 } */
diff --git gcc/testsuite/g++.dg/goacc/template-reduction.C gcc/testsuite/g++.dg/goacc/template-reduction.C
new file mode 100644
index 0000000..3618c02
--- /dev/null
+++ gcc/testsuite/g++.dg/goacc/template-reduction.C
@@ -0,0 +1,100 @@
+extern void abort ();
+
+const int n = 100;
+
+// Check explicit template copy map
+
+template<typename T> T
+sum (T array[])
+{
+   T s = 0;
+
+#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (s, array[0:n])
+  for (int i = 0; i < n; i++)
+    s += array[i];
+
+  return s;
+}
+
+// Check implicit template copy map
+
+template<typename T> T
+sum ()
+{
+  T s = 0;
+  T array[n];
+
+  for (int i = 0; i < n; i++)
+    array[i] = i+1;
+
+#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy (s)
+  for (int i = 0; i < n; i++)
+    s += array[i];
+
+  return s;
+}
+
+// Check present and async
+
+template<typename T> T
+async_sum (T array[])
+{
+   T s = 0;
+
+#pragma acc parallel loop num_gangs (10) gang async (1) present (array[0:n])
+   for (int i = 0; i < n; i++)
+     array[i] = i+1;
+
+#pragma acc parallel loop num_gangs (10) gang reduction (+:s) present (array[0:n]) copy (s) async wait (1)
+  for (int i = 0; i < n; i++)
+    s += array[i];
+
+#pragma acc wait
+
+  return s;
+}
+
+// Check async and wait on a reduction that maps no array
+
+template<typename T> T
+async_sum (int c)
+{
+   T s = 0;
+
+#pragma acc parallel loop num_gangs (10) gang reduction (+:s) copy(s) async wait (1)
+  for (int i = 0; i < n; i++)
+    s += i+1;
+
+#pragma acc wait
+
+  return s;
+}
+
+int
+main()
+{
+  int a[n];
+  int result = 0;
+
+  for (int i = 0; i < n; i++)
+    {
+      a[i] = i+1;
+      result += i+1;
+    }
+
+  if (sum (a) != result)
+    abort ();
+
+  if (sum<int> () != result)
+    abort ();
+
+#pragma acc enter data copyin (a)
+  if (async_sum (a) != result)
+    abort ();
+
+  if (async_sum<int> (1) != result)
+    abort ();
+#pragma acc exit data delete (a)
+
+  return 0;
+}
diff --git gcc/testsuite/g++.dg/goacc/template.C gcc/testsuite/g++.dg/goacc/template.C
new file mode 100644
index 0000000..497c004
--- /dev/null
+++ gcc/testsuite/g++.dg/goacc/template.C
@@ -0,0 +1,131 @@
+#include <cstdio>
+
+#pragma acc routine
+template <typename T> T
+accDouble(int val)
+{
+  return val * 2;
+}
+
+template<typename T> T
+oacc_parallel_copy (T a)
+{
+  T b = 0;
+  char w = 1;
+  int x = 2;
+  float y = 3;
+  double z = 4;
+
+#pragma acc parallel num_gangs (a) num_workers (a) vector_length (a) default (none) copyout (b) copyin (a)
+  {
+    b = a;
+  }
+
+#pragma acc parallel num_gangs (a) copy (w, x, y, z)
+  {
+    w = accDouble<char>(w);
+    x = accDouble<int>(x);
+    y = accDouble<float>(y);
+    z = accDouble<double>(z);
+  }
+
+#pragma acc parallel num_gangs (a) if (1)
+  {
+#pragma acc loop independent collapse (2) device_type (nvidia) gang
+  for (int i = 0; i < a; i++)
+    for (int j = 0; j < 5; j++)
+      b = a;
+  }
+
+  T c;
+
+#pragma acc parallel num_workers (10)
+  {
+#pragma acc atomic capture
+    c = b++;
+
+#pragma acc atomic update
+    c++;
+
+#pragma acc atomic read
+    b = a;
+
+#pragma acc atomic write
+    b = a;
+  }
+
+#pragma acc parallel reduction (+:c)
+  {
+    c = 1;
+  }
+
+#pragma acc data if (1) copy (b)
+  {
+    #pragma acc parallel
+    {
+      b = a;
+    }
+  }
+
+#pragma acc enter data copyin (b)
+#pragma acc parallel present (b)
+    {
+      b = a;
+    }
+
+#pragma acc update host (b)
+#pragma acc update self (b)
+#pragma acc update device (b)
+#pragma acc exit data delete (b)
+
+  return b;
+}
+
+template<typename T> T
+oacc_kernels_copy (T a)
+{
+  T b = 0;
+  T c = 0;
+  char w = 1;
+  int x = 2;
+  float y = 3;
+  double z = 4;
+
+#pragma acc kernels copy (w, x, y, z)
+  {
+    w = accDouble<char>(w);
+    x = accDouble<int>(x);
+    y = accDouble<float>(y);
+    z = accDouble<double>(z);
+  }
+
+#pragma acc kernels copyout (b) copyin (a)
+  b = a;
+
+#pragma acc data if (1) copy (b)
+  {
+    #pragma acc kernels
+    {
+      b = a;
+    }
+  }
+
+#pragma acc enter data copyin (b)
+#pragma acc kernels present (b)
+    {
+      b = a;
+    }
+  return b;
+}
+
+int
+main ()
+{
+  int b = oacc_parallel_copy<int> (5);
+  int c = oacc_kernels_copy<int> (5);
+
+  printf ("b = %d\n", b);
+  printf ("c = %d\n", c);
+
+  return 0;
+}
diff --git gcc/testsuite/gfortran.dg/goacc/cache-1.f95 gcc/testsuite/gfortran.dg/goacc/cache-1.f95
index 746cf02..74ab332 100644
--- gcc/testsuite/gfortran.dg/goacc/cache-1.f95
+++ gcc/testsuite/gfortran.dg/goacc/cache-1.f95
@@ -9,4 +9,3 @@ program test
     !$acc cache (d)
   enddo
 end
-! { dg-prune-output "unimplemented" }
diff --git gcc/testsuite/gfortran.dg/goacc/coarray.f95 gcc/testsuite/gfortran.dg/goacc/coarray.f95
index 4f1224e..08e4004 100644
--- gcc/testsuite/gfortran.dg/goacc/coarray.f95
+++ gcc/testsuite/gfortran.dg/goacc/coarray.f95
@@ -32,4 +32,4 @@ contains
     !$acc update self (a)
   end subroutine oacc1
 end module test
-! { dg-prune-output "ACC cache unimplemented" }
+! { dg-bogus "sorry, unimplemented: directive not yet implemented" "host_data" { xfail *-*-* } 19 }
diff --git gcc/testsuite/gfortran.dg/goacc/coarray_2.f90 gcc/testsuite/gfortran.dg/goacc/coarray_2.f90
index f35d4b9..06a2bed 100644
--- gcc/testsuite/gfortran.dg/goacc/coarray_2.f90
+++ gcc/testsuite/gfortran.dg/goacc/coarray_2.f90
@@ -2,6 +2,7 @@
 ! { dg-additional-options "-fcoarray=lib" }
 !
 ! PR fortran/63861
+! { dg-xfail-if "<http://gcc.gnu.org/PR63861>" { *-*-* } }
 
 module test
 contains
diff --git gcc/testsuite/gfortran.dg/goacc/combined_loop.f90 gcc/testsuite/gfortran.dg/goacc/combined_loop.f90
index b8be649..58aaa4f 100644
--- gcc/testsuite/gfortran.dg/goacc/combined_loop.f90
+++ gcc/testsuite/gfortran.dg/goacc/combined_loop.f90
@@ -6,7 +6,7 @@ subroutine oacc1()
   implicit none
   integer :: i
   integer  :: a
-  !$acc parallel loop reduction(+:a) ! { dg-excess-errors "sorry, unimplemented: directive not yet implemented" }
+  !$acc parallel loop reduction(+:a)
   do i = 1,5
   enddo
 end subroutine oacc1
diff --git gcc/testsuite/gfortran.dg/goacc/cray.f95 gcc/testsuite/gfortran.dg/goacc/cray.f95
index 8f2c077..28294ee 100644
--- gcc/testsuite/gfortran.dg/goacc/cray.f95
+++ gcc/testsuite/gfortran.dg/goacc/cray.f95
@@ -53,4 +53,3 @@ contains
     !$acc update self (ptr)
   end subroutine oacc1
 end module test
-! { dg-prune-output "unimplemented" }
diff --git gcc/testsuite/gfortran.dg/goacc/declare-1.f95 gcc/testsuite/gfortran.dg/goacc/declare-1.f95
index 03540f1..14190a7 100644
--- gcc/testsuite/gfortran.dg/goacc/declare-1.f95
+++ gcc/testsuite/gfortran.dg/goacc/declare-1.f95
@@ -15,6 +15,5 @@ contains
     END BLOCK
   end function foo
 end program test
-! { dg-prune-output "unimplemented" }
-! { dg-final { scan-tree-dump-times "pragma acc declare map\\(force_tofrom:i\\)" 2 "original" } } 
+! { dg-final { scan-tree-dump-times "pragma acc data map\\(force_tofrom:i\\)" 2 "original" } }
 ! { dg-final { cleanup-tree-dump "original" } } 
diff --git gcc/testsuite/gfortran.dg/goacc/declare-2.f95 gcc/testsuite/gfortran.dg/goacc/declare-2.f95
new file mode 100644
index 0000000..afdbe2e
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/declare-2.f95
@@ -0,0 +1,44 @@
+
+module amod
+
+contains
+
+subroutine asubr (b)
+  implicit none
+  integer :: b(8)
+
+  !$acc declare copy (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare copyout (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare present (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare present_or_copy (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare present_or_copyin (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare present_or_copyout (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare present_or_create (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare deviceptr (b) ! { dg-error "Invalid clause in module" }
+  !$acc declare create (b) copyin (b) ! { dg-error "present on multiple clauses" }
+
+end subroutine
+
+end module
+
+subroutine bsubr (foo)
+  implicit none
+
+  integer, dimension (:) :: foo
+
+  !$acc declare copy (foo) ! { dg-error "assumed-size dummy array" }
+  !$acc declare copy (foo(1:2)) ! { dg-error "assumed-size dummy array" }
+
+end subroutine
+
+program test
+  integer :: a(8)
+  integer :: b(8)
+  integer :: c(8)
+
+  !$acc declare create (a) copyin (a) ! { dg-error "present on multiple clauses" }
+  !$acc declare copyin (b)
+  !$acc declare copyin (b) ! { dg-error "present on multiple clauses" }
+  !$acc declare copy (c(1:2)) ! { dg-error "Subarray: 'c' not allowed" }
+
+end program
diff --git gcc/testsuite/gfortran.dg/goacc/default.f95 gcc/testsuite/gfortran.dg/goacc/default.f95
new file mode 100644
index 0000000..c1fc52e
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/default.f95
@@ -0,0 +1,17 @@
+! { dg-do compile }
+
+program tile
+  integer i, j, a
+
+  !$acc parallel default (shared) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc end parallel ! { dg-error "Unexpected" }
+
+  !$acc parallel default (private) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc end parallel ! { dg-error "Unexpected" }
+
+  !$acc parallel default (none)
+  !$acc end parallel
+
+  !$acc parallel default (firstprivate) ! { dg-error "Unclassifiable OpenACC directive" }
+  !$acc end parallel ! { dg-error "Unexpected" }
+end program tile
diff --git gcc/testsuite/gfortran.dg/goacc/dtype-1.f95 gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
new file mode 100644
index 0000000..350e443
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/dtype-1.f95
@@ -0,0 +1,161 @@
+! { dg-do compile }
+! { dg-options "-fopenacc -fdump-tree-omplower" }
+
+program dtype
+  integer i1
+
+!! ACC PARALLEL DEVICE_TYPE:
+
+!$acc parallel dtype (nVidia) async (1) num_gangs (100) &
+!$acc&  num_workers (100) vector_length (32) wait (1)
+!$acc end parallel
+
+!$acc parallel async (1) num_gangs (1) num_workers (1) vector_length (1) &
+!$acc& wait (1) device_type (nvidia) async (2) num_gangs (200) &
+!$acc&  num_workers (200) vector_length (64) wait (2)
+!$acc end parallel
+
+!$acc parallel async (1) num_gangs (1) num_workers (1) vector_length (1) &
+!$acc& wait (1) device_type (nvidia) async (3) num_gangs (300) &
+!$acc& num_workers (300) vector_length (128) wait (3) dtype (*) &
+!$acc& async (10) num_gangs (10) num_workers (10) vector_length (10) wait (10)
+!$acc end parallel
+
+!$acc parallel async (1) num_gangs (1) num_workers (1) vector_length (1) &
+!$acc& wait (1) dtype (nvidia_ptx) async (3) num_gangs (300) &
+!$acc& num_workers (300) vector_length (128) wait (3) device_type (*) &
+!$acc& async (10) num_gangs (10) num_workers (10) vector_length (10) wait (10)
+!$acc end parallel
+
+!! ACC KERNELS DEVICE_TYPE:
+
+!$acc kernels device_type (nvidia) async wait
+!$acc end kernels
+
+!$acc kernels async wait dtype (nvidia) async (1) wait (1)
+!$acc end kernels
+
+!$acc kernels async wait dtype (nvidia) async (2) wait (2) &
+!$acc& device_type (*) async (0) wait (0)
+!$acc end kernels
+
+!$acc kernels async wait device_type (nvidia_ptx) async (1) wait (1) &
+!$acc& dtype (*) async (0) wait (0)
+!$acc end kernels
+
+!! ACC LOOP DEVICE_TYPE:
+
+!$acc parallel
+!$acc loop device_type (nVidia) gang
+  do i1 = 1, 10
+  end do
+!$acc end parallel
+
+!$acc parallel
+!$acc loop dtype (nVidia) gang dtype (*) worker
+  do i1 = 1, 10
+  end do
+!$acc end parallel
+
+!$acc parallel
+!$acc loop dtype (nVidiaGPU) gang dtype (*) vector
+  do i1 = 1, 10
+  end do
+!$acc end parallel
+
+!! ACC UPDATE:
+
+!$acc update host(i1) async(1) wait (1)
+
+!$acc update host(i1) device_type(nvidia) async(2) wait (2)
+
+!$acc update host(i1) async(1) wait (1) dtype(nvidia) async(3) wait (3)
+
+!$acc update host(i1) async(4) wait (4) device_type(nvidia) async(5) wait (5) &
+!$acc& dtype (*) async (6) wait (6)
+
+!$acc update host(i1) async(4) wait (4) dtype(nvidia1) async(5) &
+!$acc& wait (5) device_type (*) async (6) wait (6)
+end program dtype
+
+!! ACC ROUTINE:
+
+subroutine sr1 ()
+  !$acc routine device_type (nvidia) gang
+end subroutine sr1
+
+subroutine sr2 ()
+  !$acc routine dtype (nvidia) worker
+end subroutine sr2
+
+subroutine sr3 ()
+  !$acc routine device_type (nvidia) vector
+end subroutine sr3
+
+subroutine sr5 ()
+  !$acc routine dtype (nvidia) bind (foo)
+end subroutine sr5
+
+subroutine sr1a ()
+  !$acc routine device_type (nvidia) gang device_type (*) worker
+end subroutine sr1a
+
+subroutine sr2a ()
+  !$acc routine dtype (nvidia) worker dtype (*) vector
+end subroutine sr2a
+
+subroutine sr3a ()
+  !$acc routine dtype (nvidia) vector device_type (*) gang
+end subroutine sr3a
+
+subroutine sr4a ()
+  !$acc routine device_type (nvidia) vector device_type (*) worker
+end subroutine sr4a
+
+subroutine sr5a ()
+  !$acc routine device_type (nvidia) bind (foo) dtype (*) gang
+end subroutine sr5a
+
+subroutine sr1b ()
+  !$acc routine dtype (gpu) gang dtype (*) worker
+end subroutine sr1b
+
+subroutine sr2b ()
+  !$acc routine dtype (gpu) worker device_type (*) worker
+end subroutine sr2b
+
+subroutine sr3b ()
+  !$acc routine device_type (gpu) vector device_type (*) worker
+end subroutine sr3b
+
+subroutine sr4b ()
+  !$acc routine device_type (gpu) worker device_type (*) worker
+end subroutine sr4b
+
+subroutine sr5b ()
+  !$acc routine dtype (gpu) bind (foo) device_type (*) gang
+end subroutine sr5b
+
+! { dg-final { scan-tree-dump-times "oacc_parallel async\\(1\\) wait\\(1\\) num_gangs\\(100\\) num_workers\\(100\\) vector_length\\(32\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "oacc_parallel async\\(2\\) wait\\(2\\) num_gangs\\(200\\) num_workers\\(200\\) vector_length\\(64\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "oacc_parallel async\\(3\\) wait\\(3\\) num_gangs\\(300\\) num_workers\\(300\\) vector_length\\(128\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "oacc_parallel async\\(10\\) wait\\(10\\) num_gangs\\(10\\) num_workers\\(10\\) vector_length\\(10\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "oacc_kernels async\\(-1\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "oacc_kernels async\\(1\\) wait\\(1\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "oacc_kernels async\\(2\\) wait\\(2\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "oacc_kernels async\\(0\\) wait\\(0\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "acc loop private\\(i1\\) gang private\\(i1\\.1\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "acc loop private\\(i1\\) gang private\\(i1\\.2\\)" 1 "omplower" } }
+
+! { dg-final { scan-tree-dump-times "acc loop private\\(i1\\) vector private\\(i1\\.3\\)" 1 "omplower" } }
+
+! { dg-final { cleanup-tree-dump "omplower" } }
diff --git gcc/testsuite/gfortran.dg/goacc/dtype-2.f95 gcc/testsuite/gfortran.dg/goacc/dtype-2.f95
new file mode 100644
index 0000000..a4573e9
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/dtype-2.f95
@@ -0,0 +1,39 @@
+! { dg-do compile }
+
+program dtype
+  integer i1, i2, i3, i4, i5, i6
+
+!! ACC PARALLEL DEVICE_TYPE:
+
+!$acc parallel device_type (nVidia) async (1) num_gangs (100) &
+!$acc&  num_workers (100) vector_length (32) wait (1) copy (i1)
+!$acc end parallel
+
+!! ACC KERNELS DEVICE_TYPE:
+
+!$acc kernels dtype (nvidia) async wait copy (i1)
+!$acc end kernels
+
+!! ACC LOOP DEVICE_TYPE:
+
+!$acc parallel
+!$acc loop dtype (nVidia) gang tile (1) private (i1)
+  do i1 = 1, 10
+  end do
+!$acc end parallel
+
+!! ACC UPDATE:
+
+!$acc update host(i1) device_type(nvidia) async(2) wait (2) self(i2)
+
+end program dtype
+
+! { dg-error "Invalid character" "" { target *-*-* } 8 }
+! { dg-error "Unexpected" "" { target *-*-* } 10 }
+
+! { dg-error "Invalid character" "" { target *-*-* } 14 }
+! { dg-error "Unexpected" "" { target *-*-* } 15 }
+
+! { dg-error "Invalid character" "" { target *-*-* } 20 }
+
+! { dg-error "Invalid character" "" { target *-*-* } 27 }
diff --git gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95 gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95
index 19e7411..8a25829 100644
--- gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95
+++ gcc/testsuite/gfortran.dg/goacc/host_data-tree.f95
@@ -8,6 +8,6 @@ program test
   !$acc host_data use_device(i)
   !$acc end host_data
 end program test
-! { dg-prune-output "unimplemented" }
+! { dg-bogus "sorry, unimplemented: directive not yet implemented" "host_data" { xfail *-*-* } 8 }
 ! { dg-final { scan-tree-dump-times "pragma acc host_data use_device\\(i\\)" 1 "original" } } 
 ! { dg-final { cleanup-tree-dump "original" } } 
diff --git gcc/testsuite/gfortran.dg/goacc/loop-1.f95 gcc/testsuite/gfortran.dg/goacc/loop-1.f95
index e1b2dfd..817039f 100644
--- gcc/testsuite/gfortran.dg/goacc/loop-1.f95
+++ gcc/testsuite/gfortran.dg/goacc/loop-1.f95
@@ -168,4 +168,3 @@ subroutine test1
 end subroutine test1
 end module test
 ! { dg-prune-output "Deleted" }
-! { dg-prune-output "ACC cache unimplemented" }
diff --git gcc/testsuite/gfortran.dg/goacc/loop-2.f95 gcc/testsuite/gfortran.dg/goacc/loop-2.f95
index f85691e..b5e6368 100644
--- gcc/testsuite/gfortran.dg/goacc/loop-2.f95
+++ gcc/testsuite/gfortran.dg/goacc/loop-2.f95
@@ -66,7 +66,7 @@ program test
     !$acc loop seq worker ! { dg-error "conflicts with" }
     DO i = 1,10
     ENDDO
-    !$acc loop gang worker ! { dg-error "conflicts with" }
+    !$acc loop gang worker
     DO i = 1,10
     ENDDO
 
@@ -94,10 +94,10 @@ program test
     !$acc loop seq vector ! { dg-error "conflicts with" }
     DO i = 1,10
     ENDDO
-    !$acc loop gang vector ! { dg-error "conflicts with" }
+    !$acc loop gang vector
     DO i = 1,10
     ENDDO
-    !$acc loop worker vector ! { dg-error "conflicts with" }
+    !$acc loop worker vector
     DO i = 1,10
     ENDDO
 
@@ -239,7 +239,7 @@ program test
     !$acc loop seq worker ! { dg-error "conflicts with" }
     DO i = 1,10
     ENDDO
-    !$acc loop gang worker ! { dg-error "conflicts with" }
+    !$acc loop gang worker
     DO i = 1,10
     ENDDO
 
@@ -267,10 +267,10 @@ program test
     !$acc loop seq vector ! { dg-error "conflicts with" }
     DO i = 1,10
     ENDDO
-    !$acc loop gang vector ! { dg-error "conflicts with" }
+    !$acc loop gang vector
     DO i = 1,10
     ENDDO
-    !$acc loop worker vector ! { dg-error "conflicts with" }
+    !$acc loop worker vector
     DO i = 1,10
     ENDDO
 
@@ -392,7 +392,7 @@ program test
   !$acc kernels loop seq worker ! { dg-error "conflicts with" }
   DO i = 1,10
   ENDDO
-  !$acc kernels loop gang worker ! { dg-error "conflicts with" }
+  !$acc kernels loop gang worker
   DO i = 1,10
   ENDDO
 
@@ -420,10 +420,10 @@ program test
   !$acc kernels loop seq vector ! { dg-error "conflicts with" }
   DO i = 1,10
   ENDDO
-  !$acc kernels loop gang vector ! { dg-error "conflicts with" }
+  !$acc kernels loop gang vector
   DO i = 1,10
   ENDDO
-  !$acc kernels loop worker vector ! { dg-error "conflicts with" }
+  !$acc kernels loop worker vector
   DO i = 1,10
   ENDDO
 
@@ -544,7 +544,7 @@ program test
   !$acc parallel loop seq worker ! { dg-error "conflicts with" }
   DO i = 1,10
   ENDDO
-  !$acc parallel loop gang worker ! { dg-error "conflicts with" }
+  !$acc parallel loop gang worker
   DO i = 1,10
   ENDDO
 
@@ -572,10 +572,10 @@ program test
   !$acc parallel loop seq vector ! { dg-error "conflicts with" }
   DO i = 1,10
   ENDDO
-  !$acc parallel loop gang vector ! { dg-error "conflicts with" }
+  !$acc parallel loop gang vector
   DO i = 1,10
   ENDDO
-  !$acc parallel loop worker vector ! { dg-error "conflicts with" }
+  !$acc parallel loop worker vector
   DO i = 1,10
   ENDDO
 
@@ -646,4 +646,4 @@ program test
   !$acc parallel loop gang worker tile(*) 
   DO i = 1,10
   ENDDO
-end
\ No newline at end of file
+end
diff --git gcc/testsuite/gfortran.dg/goacc/modules.f95 gcc/testsuite/gfortran.dg/goacc/modules.f95
new file mode 100644
index 0000000..19a2abe
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/modules.f95
@@ -0,0 +1,55 @@
+! { dg-do compile } 
+
+MODULE reduction_test
+
+CONTAINS
+
+SUBROUTINE reduction_kernel(x_min,x_max,y_min,y_max,arr,sum)
+
+  IMPLICIT NONE
+
+  INTEGER      :: x_min,x_max,y_min,y_max
+  REAL(KIND=8), DIMENSION(x_min-2:x_max+2,y_min-2:y_max+2) :: arr
+  REAL(KIND=8) :: sum
+
+  INTEGER      :: j,k
+
+  sum=0.0
+
+!$ACC DATA PRESENT(arr) COPY(sum)
+!$ACC PARALLEL LOOP REDUCTION(+ : sum)
+  DO k=y_min,y_max
+    DO j=x_min,x_max
+      sum=sum+arr(j,k)
+    ENDDO
+  ENDDO
+!$ACC END PARALLEL LOOP
+!$ACC END DATA
+
+END SUBROUTINE reduction_kernel
+
+END MODULE reduction_test
+
+program main
+    use reduction_test
+
+    integer :: x_min,x_max,y_min,y_max
+    real(kind=8), dimension(1:10,1:10) :: arr
+    real(kind=8) :: sum
+
+    x_min = 5
+    x_max = 6
+    y_min = 5
+    y_max = 6
+
+    arr(:,:) = 1.0
+
+    sum = 1.0
+
+    !$acc data copy(arr)
+
+    call reduction_kernel(x_min,x_max,y_min,y_max,arr,sum)
+
+    !$acc end data
+
+end program
diff --git gcc/testsuite/gfortran.dg/goacc/parameter.f95 gcc/testsuite/gfortran.dg/goacc/parameter.f95
index 1364181..82c25ba 100644
--- gcc/testsuite/gfortran.dg/goacc/parameter.f95
+++ gcc/testsuite/gfortran.dg/goacc/parameter.f95
@@ -29,4 +29,3 @@ contains
     !$acc update self (a) ! { dg-error "not a variable" }
   end subroutine oacc1
 end module test
-! { dg-prune-output "unimplemented" }
diff --git gcc/testsuite/gfortran.dg/goacc/update.f95 gcc/testsuite/gfortran.dg/goacc/update.f95
new file mode 100644
index 0000000..ae23dfc
--- /dev/null
+++ gcc/testsuite/gfortran.dg/goacc/update.f95
@@ -0,0 +1,5 @@
+! { dg-do compile } 
+
+program foo
+  !$acc update ! { dg-error "must contain at least one 'device' or 'host/self' clause" }
+end program foo
diff --git libgomp/testsuite/libgomp.oacc-c++/template-reduction.C libgomp/testsuite/libgomp.oacc-c++/template-reduction.C
new file mode 100644
index 0000000..c158b7a
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c++/template-reduction.C
@@ -0,0 +1,102 @@
+/* { dg-do run } */
+
+#include <cstdlib>
+
+const int n = 100;
+
+// Check explicit template copy map
+
+template<typename T> T
+sum (T array[])
+{
+   T s = 0;
+
+#pragma acc parallel loop vector_length (10) reduction (+:s) copy (s, array[0:n])
+  for (int i = 0; i < n; i++)
+    s += array[i];
+
+  return s;
+}
+
+// Check implicit template copy map
+
+template<typename T> T
+sum ()
+{
+  T s = 0;
+  T array[n];
+
+  for (int i = 0; i < n; i++)
+    array[i] = i+1;
+
+#pragma acc parallel loop vector_length (10) reduction (+:s) copy (s)
+  for (int i = 0; i < n; i++)
+    s += array[i];
+
+  return s;
+}
+
+// Check present and async
+
+template<typename T> T
+async_sum (T array[])
+{
+   T s = 0;
+
+#pragma acc parallel loop vector_length (10) async (1) present (array[0:n])
+   for (int i = 0; i < n; i++)
+     array[i] = i+1;
+
+#pragma acc parallel loop vector_length (10) reduction (+:s) present (array[0:n]) copy (s) async wait (1)
+  for (int i = 0; i < n; i++)
+    s += array[i];
+
+#pragma acc wait
+
+  return s;
+}
+
+// Check async and wait on a reduction that maps no array
+
+template<typename T> T
+async_sum (int c)
+{
+   T s = 0;
+
+#pragma acc parallel loop vector_length (10) reduction (+:s) copy(s) async wait (1)
+  for (int i = 0; i < n; i++)
+    s += i+1;
+
+#pragma acc wait
+
+  return s;
+}
+
+int
+main()
+{
+  int a[n];
+  int result = 0;
+
+  for (int i = 0; i < n; i++)
+    {
+      a[i] = i+1;
+      result += i+1;
+    }
+
+  if (sum (a) != result)
+    abort ();
+
+  if (sum<int> () != result)
+    abort ();
+
+#pragma acc enter data copyin (a)
+  if (async_sum (a) != result)
+    abort ();
+
+  if (async_sum<int> (1) != result)
+    abort ();
+#pragma acc exit data delete (a)
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-1.c
new file mode 100644
index 0000000..ad958cd
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-1.c
@@ -0,0 +1,866 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+/* Exercise the expression-statement forms of "#pragma acc atomic capture" on
+   int, long long and float data; abort on the first unexpected result.  */
+int
+main(int argc, char **argv)
+{
+  int N = 32, i;
+  int iexp, igot, idata[N];
+  long long lexp, lgot, ldata[N];
+  float fexp, fgot, fdata[N];
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        idata[i] = igot++;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        idata[i] = igot--;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        idata[i] = ++igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        idata[i] = --igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = + */
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        idata[i] = igot += expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        idata[i] = igot = igot + expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        idata[i] = igot = expr + igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = * */
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        ldata[i] = lgot *= expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        ldata[i] = lgot = lgot * expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        ldata[i] = lgot = expr * lgot;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = - */
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        idata[i] = igot -= expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        idata[i] = igot = igot - expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 32;  /* 1 - igot toggles between two values; N is even.  */
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        idata[i] = igot = expr - igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = / */
+  lgot = 1LL << 32;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        ldata[i] = lgot /= expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << 32;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        ldata[i] = lgot = lgot / expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 2LL;
+  lexp = 2LL;  /* (1 << N) / lgot toggles between 2^(N-1) and 2.  */
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic capture
+        ldata[i] = lgot = expr / lgot;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = & */
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot &= expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot = igot & expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot = expr & igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = ^ */
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot ^= expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot = igot ^ expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot = expr ^ igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = | */
+  igot = 0;
+  iexp = ~0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot |= expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = ~0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot = igot | expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = ~0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1 << i;
+
+#pragma acc atomic capture
+        idata[i] = igot = expr | igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = << */
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        ldata[i] = lgot <<= expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        ldata[i] = lgot = lgot << expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 2LL;  /* A single update, not a loop: 1 << 1.  */
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel
+    {
+      long long expr = 1LL;
+
+#pragma acc atomic capture
+      ldata[0] = lgot = expr << lgot;
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = >> */
+  lgot = 1LL << N;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        ldata[i] = lgot >>= expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << N;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        ldata[i] = lgot = lgot >> expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;  /* Shift count; must stay below the width of long long.  */
+  lexp = 1LL << (N - 1);
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel
+    {
+      long long expr = 1LL << N;
+
+#pragma acc atomic capture
+      ldata[0] = lgot = expr >> lgot;
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        fdata[i] = fgot++;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 32.0;
+  fexp = 0.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        fdata[i] = fgot--;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        fdata[i] = ++fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 32.0;
+  fexp = 0.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic capture
+        fdata[i] = --fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = + */
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot += expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot = fgot + expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot = expr + fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = * */
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;  /* 2^32, the result of N doublings of 1.  */
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot *= expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        fdata[i] = fgot = fgot * expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot = expr * fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = - */
+  fgot = 32.0;
+  fexp = 0.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot -= expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 32.0;
+  fexp = 0.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot = fgot - expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 1.0;  /* 32 - fgot toggles between 31 and 1; N is even.  */
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 32.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot = expr - fgot;
+      }
+  }
+
+  /* The captured values alternate 31, 1, ... in execution order, which need
+     not follow the index order: count them instead of checking i's parity.  */
+  {
+    int high = 0;
+    for (i = 0; i < N; i++)
+      if (fdata[i] == 31.0)
+        high++;
+      else if (fdata[i] != 1.0)
+        abort ();
+    if (high != N / 2 || fexp != fgot)
+      abort ();
+  }
+
+  /* BINOP = / */
+  fexp = 1.0;
+  fgot = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot /= expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fexp = 1.0;
+  fgot = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        fdata[i] = fgot = fgot / expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fexp = 1.0;
+  fgot = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel
+    {
+      float expr = 8192.0*8192.0*64.0;
+
+#pragma acc atomic capture
+      fdata[0] = fgot = expr / fgot;
+    }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
new file mode 100644
index 0000000..842f2de
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
@@ -0,0 +1,1626 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+int
+main(int argc, char **argv)
+{
+  int   iexp, igot, imax, imin;
+  long long lexp, lgot;
+  int   N = 32;
+  int	i;
+  int   idata[N];
+  long long ldata[N];
+  float fexp, fgot;
+  float fdata[N];
+
+  igot = 1234;
+  iexp = 31;
+
+  for (i = 0; i < N; i++)
+    idata[i] = i;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { idata[i] = igot; igot = i; }
+  }
+
+  imax = 0;
+  imin = N;
+
+  for (i = 0; i < N; i++)
+    {
+      imax = idata[i] > imax ? idata[i] : imax;
+      imin = idata[i] < imin ? idata[i] : imin;
+    }
+
+  if (imax != 1234 || imin != 0)
+    abort ();
+
+  return 0;
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { idata[i] = igot; igot++; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { idata[i] = igot; ++igot; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { ++igot; idata[i] = igot; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { igot++; idata[i] = igot; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { idata[i] = igot; igot--; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { idata[i] = igot; --igot; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { --igot; idata[i] = igot; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+#pragma acc atomic capture
+      { igot--; idata[i] = igot; }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = + */
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { idata[i] = igot; igot += expr; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { igot += expr; idata[i] = igot; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { idata[i] = igot; igot = igot + expr; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { idata[i] = igot; igot = expr + igot; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { igot = igot + expr; idata[i] = igot; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+
+  igot = 0;
+  iexp = 32;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { igot = expr + igot; idata[i] = igot; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = * */
+  lgot = 1LL;
+  lexp = 1LL << 32;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = 2LL;
+
+#pragma acc atomic capture
+      { ldata[i] = lgot; lgot *= expr; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << 32;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        { lgot *= expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << 32;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = lgot * expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << 32;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = 2LL;
+
+#pragma acc atomic capture
+      { ldata[i] = lgot; lgot = expr * lgot; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << 32;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        { lgot = lgot * expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << 32;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = 2;
+
+#pragma acc atomic capture
+      { lgot = expr * lgot; ldata[i] = lgot; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = - */
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      int expr = 1;
+
+#pragma acc atomic capture
+      { idata[i] = igot; igot -= expr; }
+    }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { igot -= expr; idata[i] = igot; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 32;
+  iexp = 0;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { idata[i] = igot; igot = igot - expr; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 1;
+  iexp = 1;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      int expr = 1;
+
+#pragma acc atomic capture
+      { idata[i] = igot; igot = expr - igot; }
+    }
+  }
+
+  for (i = 0; i < N; i++)
+    if (i % 2 == 0)
+      {
+	if (idata[i] != 1)
+	  abort ();
+      }
+    else
+      {
+	if (idata[i] != 0)
+	  abort ();
+      }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 1;
+  iexp = -31;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { igot = igot - expr; idata[i] = igot; }
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 1;
+  iexp = 1;
+
+#pragma acc data copy (igot, idata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = 1;
+
+#pragma acc atomic capture
+        { igot = expr - igot; idata[i] = igot; }
+      }
+  }
+
+  for (i = 0; i < N; i++)
+    if (i % 2 == 0)
+      {
+	if (idata[i] != 0)
+	  abort ();
+      }
+    else
+      {
+	if (idata[i] != 1)
+	  abort ();
+      }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = / */
+  lgot = 1LL << 32;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot /= expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << 32;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2LL;
+
+#pragma acc atomic capture
+        { lgot /= expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << 32;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = 2LL;
+
+#pragma acc atomic capture
+      { ldata[i] = lgot; lgot = lgot / expr; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 2LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = expr / lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 2LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic capture
+        { lgot = lgot / expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 2LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic capture
+        { lgot = expr / lgot; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = & */
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot &= expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  iexp = 0LL; 
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot &= expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = lgot & expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = expr & lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  iexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot = lgot & expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+      { lgot = expr & lgot; ldata[i] = lgot; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = ^ */
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = 1 << i;
+
+#pragma acc atomic capture
+      { ldata[i] = lgot; lgot ^= expr; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  iexp = 0LL; 
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot ^= expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = lgot ^ expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+      { ldata[i] = lgot; lgot = expr ^ lgot; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  iexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot = lgot ^ expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = ~0LL;
+  lexp = 0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot = expr ^ lgot; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = | */
+  lgot = 0LL;
+  lexp = ~0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1 << i;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot |= expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 0LL;
+  iexp = ~0LL; 
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot |= expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 0LL;
+  lexp = ~0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = lgot | expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 0LL;
+  lexp = ~0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = expr | lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 0LL;
+  iexp = ~0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot = lgot | expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 0LL;
+  lexp = ~0LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = ~(1 << i);
+
+#pragma acc atomic capture
+        { lgot = expr | lgot; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = << */
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot <<= expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  iexp = 1LL << N; 
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { lgot <<= expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = lgot << expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = expr << lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { lgot = lgot << expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { lgot = expr << lgot; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = >> */
+  lgot = 1LL << N;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+  
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot >>= expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << N;
+  iexp = 1LL; 
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { lgot >>= expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << N;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = lgot >> expr; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << (N - 1);
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic capture
+        { ldata[i] = lgot; lgot = expr >> lgot; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << N;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic capture
+        { lgot = lgot >> expr; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << (N - 1);
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic capture
+        { lgot = expr >> lgot; ldata[i] = lgot; }
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  // FLOAT FLOAT FLOAT
+
+  /* BINOP = + */
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      float expr = 1.0;
+
+#pragma acc atomic capture
+      { fdata[i] = fgot; fgot += expr; }
+    }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fgot += expr; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { idata[i] = fgot; fgot = fgot + expr; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      float expr = 1.0;
+
+#pragma acc atomic capture
+      { fdata[i] = fgot; fgot = expr + fgot; }
+    }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fgot = fgot + expr; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 0.0;
+  fexp = 32.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fgot = expr + fgot; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = * */
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      float expr = 2.0;
+
+#pragma acc atomic capture
+      { fdata[i] = fgot; fgot *= expr; }
+    }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        { fgot *= expr; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot = fgot * expr; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot = expr * fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << 32;
+
+#pragma acc data copy (lgot, ldata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      long long expr = 2LL;
+
+#pragma acc atomic capture
+      { lgot = lgot * expr; ldata[i] = lgot; }
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 8192.0*8192.0*64.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 2;
+
+#pragma acc atomic capture
+        { fgot = expr * fgot; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = - */
+  fgot = 32.0;
+  fexp = 0.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+  
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot -= expr; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 32.0;
+  fexp = 0.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+      float expr = 1.0;
+
+#pragma acc atomic capture
+      { fgot -= expr; fdata[i] = fgot; }
+    }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 32.0;
+  fexp = 0.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot = fgot - expr; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 1.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot = expr - fgot; }
+      }
+  }
+
+  for (i = 0; i < N; i++)
+    if (i % 2 == 0)
+      {
+	if (fdata[i] != 1.0)
+	  abort ();
+      }
+    else
+      {
+	if (fdata[i] != 0.0)
+	  abort ();
+      }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = -31.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fgot = fgot - expr; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1.0;
+  fexp = 1.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fgot = expr - fgot; fdata[i] = fgot; }
+      }
+  }
+
+  for (i = 0; i < N; i++)
+    if (i % 2 == 0)
+      {
+	if (fdata[i] != 0.0)
+	  abort ();
+      }
+    else
+      {
+	if (fdata[i] != 1.0)
+	  abort ();
+      }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = / */
+  fgot = 8192.0*8192.0*64.0;
+  fexp = 1.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot /= expr; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 8192.0*8192.0*64.0;
+  fexp = 1.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        { fgot /= expr; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 8192.0*8192.0*64.0;
+  fexp = 1.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot = fgot / expr; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 8192.0*8192.0*64.0;
+  fexp = 1.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+
+#pragma acc atomic capture
+        { fdata[i] = fgot; fgot = expr / fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 4.0;
+  fexp = 4.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic capture
+        { fgot = fgot / expr; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 4.0;
+  fexp = 4.0;
+
+#pragma acc data copy (fgot, fdata[0:N])
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic capture
+        { fgot = expr / fgot; fdata[i] = fgot; }
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_update-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_update-1.c
new file mode 100644
index 0000000..18ee3aa
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_update-1.c
@@ -0,0 +1,760 @@
+/* { dg-do run } */
+
+#include <stdlib.h>
+
+int
+main(int argc, char **argv)
+{
+  float fexp, fgot;
+  int   iexp, igot;
+  long long lexp, lgot;
+  int   N = 32;
+  int	i;
+
+  fgot = 1234.0;
+  fexp = 1235.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+#pragma acc atomic update
+      fgot++;
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = fgot - N;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic update
+        fgot--;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = fgot + N;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic update
+        ++fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = fgot - N;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+#pragma acc atomic update
+        --fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = + */
+
+  fgot = 1234.0;
+  fexp = fgot + N;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+#pragma acc atomic update
+        fgot += expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = fgot + N;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+#pragma acc atomic update
+        fgot = fgot + expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = fgot + N;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+#pragma acc atomic update
+        fgot = expr + fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 0.5;
+#pragma acc atomic update
+        fgot = (expr + expr) + fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = * */
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp *= 2.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+#pragma acc atomic update
+        fgot *= expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp = fexp * 2.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+#pragma acc atomic update
+        fgot = fgot * expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp = 2.0 * fexp;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+#pragma acc atomic update
+        fgot = expr * fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+#pragma acc atomic update
+        fgot = (expr + expr) * fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = - */
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp -= 2.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+#pragma acc atomic update
+        fgot -= expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp = fexp - 2.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+#pragma acc atomic update
+        fgot = fgot - expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp = 2.0 - fexp;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic update
+        fgot = expr - fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+#pragma acc atomic update
+        fgot = (expr + expr) - fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = / */
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp /= 2.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+#pragma acc atomic update
+        fgot /= expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp = fexp / 2.0;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+	
+#pragma acc atomic update
+        fgot = fgot / expr;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp = 2.0 / fexp;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 2.0;
+
+#pragma acc atomic update
+        fgot = expr / fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  fgot = 1234.0;
+  fexp = 1234.0;
+
+  for (i = 0; i < N; i++)
+    fexp = 2.0 / fexp;
+
+#pragma acc data copy (fgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        float expr = 1.0;
+#pragma acc atomic update
+        fgot = (expr + expr) / fgot;
+      }
+  }
+
+  if (fexp != fgot)
+    abort ();
+
+  /* BINOP = &.  Every iteration clears bit i of igot (which starts as ~0);
+     the test expects igot == 0 once all N iterations have run.  */
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = ~(1 << i);
+
+#pragma acc atomic update
+        igot &= expr;  /* Form: x binop= expr.  */
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = ~(1 << i);
+#pragma acc atomic update
+        igot = igot & expr;  /* Form: x = x binop expr (must be '&', not '/').  */
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = ~(1 << i);
+#pragma acc atomic update
+        igot = expr & igot;  /* Form: x = expr binop x.  */
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = ~(1 << i);
+        int zero = 0;
+
+#pragma acc atomic update
+        igot = (expr + zero) & igot;  /* Form: x = (compound expr) binop x.  */
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = ^ */
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+
+#pragma acc atomic update
+        igot ^= expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+
+#pragma acc atomic update
+        igot = igot ^ expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+
+#pragma acc atomic update
+        igot = expr ^ igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = ~0;
+  iexp = 0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+        int zero = 0;
+
+#pragma acc atomic update
+        igot = (expr + zero) ^ igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = | */
+
+  igot = 0;
+  iexp = ~0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+
+#pragma acc atomic update
+        igot |= expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = ~0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+
+#pragma acc atomic update
+        igot = igot | expr;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = ~0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+
+#pragma acc atomic update
+        igot = expr | igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  igot = 0;
+  iexp = ~0;
+
+#pragma acc data copy (igot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        int expr = (1 << i);
+        int zero = 0;
+
+#pragma acc atomic update
+        igot = (expr + zero) | igot;
+      }
+  }
+
+  if (iexp != igot)
+    abort ();
+
+  /* BINOP = << */
+
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic update
+        lgot <<= expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << N;
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic update
+        lgot = lgot << expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic update
+        lgot = expr << lgot;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 2LL;
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL;
+        long long zero = 0LL;
+
+#pragma acc atomic update
+        lgot = (expr + zero) << lgot;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  /* BINOP = >> */
+
+  lgot = 1LL << N;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic update
+        lgot >>= expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL << N;
+  lexp = 1LL;
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < N; i++)
+      {
+        long long expr = 1LL;
+
+#pragma acc atomic update
+        lgot = lgot >> expr;
+      }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << (N - 1);
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL << N;
+
+#pragma acc atomic update
+        lgot = expr >> lgot;
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  lgot = 1LL;
+  lexp = 1LL << (N - 1);
+
+#pragma acc data copy (lgot)
+  {
+#pragma acc parallel loop
+    for (i = 0; i < 1; i++)
+      {
+        long long expr = 1LL << N;
+        long long zero = 0LL;
+
+#pragma acc atomic update
+        lgot = (expr + zero) >> lgot;
+    }
+  }
+
+  if (lexp != lgot)
+    abort ();
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/clauses-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/clauses-1.c
index 51c0cf5..410c46c 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/clauses-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/clauses-1.c
@@ -586,6 +586,32 @@ main (int argc, char **argv)
 
     for (i = 0; i < N; i++)
     {
+        a[i] = 6.0;
+        b[i] = 0.0;
+    }
+
+#pragma acc parallel pcopy (a[0:N], b[0:N])
+    {
+        int ii;
+
+        for (ii = 0; ii < N; ii++)
+            b[ii] = a[ii];
+    }
+
+    for (i = 0; i < N; i++)
+    {
+        if (b[i] != 6.0)
+            abort ();
+    }
+
+    if (acc_is_present (&a[0], (N * sizeof (float))))
+      abort ();
+
+    if (acc_is_present (&b[0], (N * sizeof (float))))
+      abort ();
+
+    for (i = 0; i < N; i++)
+    {
         a[i] = 5.0;
         b[i] = 7.0;
     }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
index f867a66..5fc9fb6 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-2.c
@@ -25,7 +25,33 @@ main (int argc, char **argv)
     }
 
 #pragma acc enter data copyin (a[0:N]) copyin (b[0:N]) copyin (N) async
-#pragma acc parallel async wait
+#pragma acc parallel async wait present (a[0:N]) present (b[0:N]) present (N)
+#pragma acc loop
+  for (i = 0; i < N; i++)
+    b[i] = a[i];
+
+#pragma acc exit data copyout (a[0:N]) copyout (b[0:N]) delete (N) wait async
+#pragma acc wait
+
+  for (i = 0; i < N; i++)
+    {
+      if (a[i] != 3.0)
+	abort ();
+
+      if (b[i] != 3.0)
+	abort ();
+    }
+
+  for (i = 0; i < N; i++)
+    {
+      a[i] = 3.0;
+      b[i] = 0.0;
+    }
+
+#pragma acc enter data copyin (a[0:N]) async 
+#pragma acc enter data copyin (b[0:N]) async wait
+#pragma acc enter data copyin (N) async wait
+#pragma acc parallel async wait present (a[0:N]) present (b[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     b[i] = a[i];
@@ -49,7 +75,7 @@ main (int argc, char **argv)
     }
 
 #pragma acc enter data copyin (a[0:N]) copyin (b[0:N]) copyin (N) async (1)
-#pragma acc parallel async (1)
+#pragma acc parallel async (1) present (a[0:N]) present (b[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     b[i] = a[i];
@@ -76,17 +102,17 @@ main (int argc, char **argv)
 
 #pragma acc enter data copyin (a[0:N]) copyin (b[0:N]) copyin (c[0:N]) copyin (d[0:N]) copyin (N) async (1)
 
-#pragma acc parallel async (1) wait (1)
+#pragma acc parallel async (1) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     b[i] = (a[i] * a[i] * a[i]) / a[i];
 
-#pragma acc parallel async (2) wait (1)
+#pragma acc parallel async (2) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     c[i] = (a[i] + a[i] + a[i] + a[i]) / a[i];
 
-#pragma acc parallel async (3) wait (1)
+#pragma acc parallel async (3) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     d[i] = ((a[i] * a[i] + a[i]) / a[i]) - a[i];
@@ -120,19 +146,19 @@ main (int argc, char **argv)
 
 #pragma acc enter data copyin (a[0:N]) copyin (b[0:N]) copyin (c[0:N]) copyin (d[0:N]) copyin (e[0:N]) copyin (N) async (1)
 
-#pragma acc parallel async (1) wait (1)
+#pragma acc parallel async (1) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
 
-#pragma acc parallel async (2) wait (1)
+#pragma acc parallel async (2) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
 
-#pragma acc parallel async (3) wait (1)
+#pragma acc parallel async (3) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
-#pragma acc parallel wait (1) async (4)
+#pragma acc parallel wait (1) async (4) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
index 747109f..6e173d3 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-3.c
@@ -25,7 +25,7 @@ main (int argc, char **argv)
     }
 
 #pragma acc enter data copyin (a[0:N]) copyin (b[0:N]) copyin (N) async
-#pragma acc parallel async wait
+#pragma acc parallel async wait present (a[0:N]) present (b[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     b[i] = a[i];
@@ -49,7 +49,7 @@ main (int argc, char **argv)
     }
 
 #pragma acc update device (a[0:N], b[0:N]) async (1)
-#pragma acc parallel async (1)
+#pragma acc parallel async (1) present (a[0:N]) present (b[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     b[i] = a[i];
@@ -78,17 +78,17 @@ main (int argc, char **argv)
 #pragma acc update device (b[0:N]) async (2)
 #pragma acc enter data copyin (c[0:N], d[0:N]) async (3)
 
-#pragma acc parallel async (1) wait (1,2)
+#pragma acc parallel async (1) wait (1,2) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     b[i] = (a[i] * a[i] * a[i]) / a[i];
 
-#pragma acc parallel async (2) wait (1,3)
+#pragma acc parallel async (2) wait (1,3) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     c[i] = (a[i] + a[i] + a[i] + a[i]) / a[i];
 
-#pragma acc parallel async (3) wait (1,3)
+#pragma acc parallel async (3) wait (1,3) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (N)
 #pragma acc loop
   for (i = 0; i < N; i++)
     d[i] = ((a[i] * a[i] + a[i]) / a[i]) - a[i];
@@ -123,19 +123,19 @@ main (int argc, char **argv)
 #pragma acc update device (a[0:N], b[0:N], c[0:N], d[0:N]) async (1)
 #pragma acc enter data copyin (e[0:N]) async (5)
 
-#pragma acc parallel async (1) wait (1)
+#pragma acc parallel async (1) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii];
 
-#pragma acc parallel async (2) wait (1)
+#pragma acc parallel async (2) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii];
 
-#pragma acc parallel async (3) wait (1)
+#pragma acc parallel async (3) wait (1) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii];
 
-#pragma acc parallel wait (1,5) async (4)
+#pragma acc parallel wait (1,5) async (4) present (a[0:N]) present (b[0:N]) present (c[0:N]) present (d[0:N]) present (e[0:N]) present (N)
   for (int ii = 0; ii < N; ii++)
     e[ii] = a[ii] + b[ii] + c[ii] + d[ii];
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/data-clauses.h libgomp/testsuite/libgomp.oacc-c-c++-common/data-clauses.h
new file mode 100644
index 0000000..8341053
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/data-clauses.h
@@ -0,0 +1,202 @@
+int i;
+
+int main(void)
+{
+  int j, v;  /* j is local, i is the file-scope int; v is set to 1 at the end of every region.  */
+  /* NOTE(review): EXEC_DIRECTIVE, ACC_MEM_SHARED and abort () are not defined in this header; presumably the including test supplies them -- confirm.  */
+  i = -1;  /* Case: copyin (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) copyin (i, j)
+  {
+    if (i != -1 || j != -2)  /* The values assigned before the region must be seen inside it.  */
+      abort ();
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+#if ACC_MEM_SHARED  /* Expect the region's stores to i and j after the region...  */
+  if (v != 1 || i != 2 || j != 1)
+    abort ();
+#else  /* ...otherwise expect the pre-region values of i and j.  */
+  if (v != 1 || i != -1 || j != -2)
+    abort ();
+#endif
+
+  i = -1;  /* Case: copyout (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) copyout (i, j)
+  {
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+  if (v != 1 || i != 2 || j != 1)  /* The region's stores are expected unconditionally here.  */
+    abort ();
+
+  i = -1;  /* Case: copy (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) copy (i, j)
+  {
+    if (i != -1 || j != -2)
+      abort ();
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+  if (v != 1 || i != 2 || j != 1)  /* The region's stores are expected unconditionally here.  */
+    abort ();
+
+  i = -1;  /* Case: create (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) create (i, j)
+  {
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+#if ACC_MEM_SHARED  /* Expect the region's stores to i and j after the region...  */
+  if (v != 1 || i != 2 || j != 1)
+    abort ();
+#else  /* ...otherwise expect the pre-region values of i and j.  */
+  if (v != 1 || i != -1 || j != -2)
+    abort ();
+#endif
+
+  i = -1;  /* Case: present_or_copyin (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) present_or_copyin (i, j)
+  {
+    if (i != -1 || j != -2)
+      abort ();
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+  if (v != 1)  /* Redundant: v is checked again in both branches below.  */
+    abort ();
+#if ACC_MEM_SHARED  /* Expect the region's stores to i and j after the region...  */
+  if (v != 1 || i != 2 || j != 1)
+    abort ();
+#else  /* ...otherwise expect the pre-region values of i and j.  */
+  if (v != 1 || i != -1 || j != -2)
+    abort ();
+#endif
+
+  i = -1;  /* Case: present_or_copyout (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) present_or_copyout (i, j)
+  {
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+  if (v != 1 || i != 2 || j != 1)  /* The region's stores are expected unconditionally here.  */
+    abort ();
+
+  i = -1;  /* Case: present_or_copy (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) present_or_copy (i, j)
+  {
+    if (i != -1 || j != -2)
+      abort ();
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+  if (v != 1 || i != 2 || j != 1)  /* The region's stores are expected unconditionally here.  */
+    abort ();
+
+  i = -1;  /* Case: present_or_create (i, j).  */
+  j = -2;
+  v = 0;
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) present_or_create (i, j)
+  {
+    i = 2;
+    j = 1;
+    if (i != 2 || j != 1)
+      abort ();
+    v = 1;
+  }
+  if (v != 1)  /* Redundant: v is checked again in both branches below.  */
+    abort ();
+#if ACC_MEM_SHARED  /* Expect the region's stores to i and j after the region...  */
+  if (v != 1 || i != 2 || j != 1)
+    abort ();
+#else  /* ...otherwise expect the pre-region values of i and j.  */
+  if (v != 1 || i != -1 || j != -2)
+    abort ();
+#endif
+
+  i = -1;  /* Case: present (i, j) nested inside an enclosing data copyin (i, j).  */
+  j = -2;
+  v = 0;
+
+#pragma acc data copyin (i, j)
+  {
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v) present (i, j)
+    {
+      if (i != -1 || j != -2)
+	abort ();
+      i = 2;
+      j = 1;
+      if (i != 2 || j != 1)
+	abort ();
+      v = 1;
+    }
+  }
+#if ACC_MEM_SHARED  /* Expect the region's stores to i and j after the region...  */
+  if (v != 1 || i != 2 || j != 1)
+    abort ();
+#else  /* ...otherwise expect the pre-region values of i and j.  */
+  if (v != 1 || i != -1 || j != -2)
+    abort ();
+#endif
+
+  i = -1;  /* Case: no clause names i or j on the inner directive; only the enclosing data copyin does.  */
+  j = -2;
+  v = 0;
+
+#pragma acc data copyin(i, j)
+  {
+#pragma acc EXEC_DIRECTIVE /* copyout */ present_or_copyout (v)
+    {
+      if (i != -1 || j != -2)
+	abort ();
+      i = 2;
+      j = 1;
+      if (i != 2 || j != 1)
+	abort ();
+      v = 1;
+    }
+  }
+#if ACC_MEM_SHARED  /* Expect the region's stores to i and j after the region...  */
+  if (v != 1 || i != 2 || j != 1)
+    abort ();
+#else  /* ...otherwise expect the pre-region values of i and j.  */
+  if (v != 1 || i != -1 || j != -2)
+    abort ();
+#endif
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-1.c
index 3acfdf5..aeb0142 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-1.c
@@ -2,183 +2,5 @@
 
 #include <stdlib.h>
 
-int i;
-
-int main (void)
-{
-  int j, v;
-
-#if 0
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) copyin (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) copyout (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) copy (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) create (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-#endif
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) present_or_copyin (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1)
-    abort ();
-#if ACC_MEM_SHARED
-  if (i != 2 || j != 1)
-    abort ();
-#else
-  if (i != -1 || j != -2)
-    abort ();
-#endif
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) present_or_copyout (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) present_or_copy (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) present_or_create (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1)
-    abort ();
-#if ACC_MEM_SHARED
-  if (i != 2 || j != 1)
-    abort ();
-#else
-  if (i != -1 || j != -2)
-    abort ();
-#endif
-
-#if 0
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v) present (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#endif
-
-#if 0
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc kernels /* copyout */ present_or_copyout (v)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#endif
-
-  return 0;
-}
+#define EXEC_DIRECTIVE kernels
+#include "data-clauses.h"
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
index 5462f12..78c834a 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
@@ -9,46 +9,14 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuModuleLoad (&module, "./subr.ptx");
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuModuleLoad failed: %d\n", r);
@@ -62,20 +30,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -90,31 +44,21 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  if (acc_async_test (0) != 0)
-    {
-      fprintf (stderr, "asynchronous operation not running\n");
-      abort ();
-    }
+  if (acc_async_test (0) == 1)
+    fprintf (stderr, "expected asynchronous operation to be running\n");
 
-  sleep (1);
+  acc_wait_all ();
 
-  if (acc_async_test (0) != 1)
-    {
-      fprintf (stderr, "found asynchronous operation still running\n");
-      abort ();
-    }
+  if (acc_async_test (0) == 0)
+    fprintf (stderr, "expected asynchronous operation to be complete\n");
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
index 912b266..ee06898 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
@@ -1,6 +1,7 @@
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,47 +11,17 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 3;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t diff;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +36,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -96,9 +53,29 @@ main (int argc, char **argv)
 	  abort ();
     }
 
+  gettimeofday (&tv1, NULL);
+
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[0], NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
+
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
+
+  gettimeofday (&tv2, NULL);
+
+  diff = tv2.tv_sec - tv1.tv_sec;
+
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,7 +89,7 @@ main (int argc, char **argv)
 	}
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  sleep ((diff + 1) * N);
 
   for (i = 0; i < N; i++)
     {
@@ -123,10 +100,6 @@ main (int argc, char **argv)
 	}
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
index e8584db..8db6bcb 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-71.c
@@ -9,45 +9,13 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -62,20 +30,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
 	{
@@ -85,7 +39,7 @@ main (int argc, char **argv)
 
   acc_set_cuda_stream (0, stream);
 
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -98,7 +52,7 @@ main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  sleep (1);
 
   if (acc_async_test (1) != 1)
     {
@@ -106,11 +60,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   return 0;
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
index e383ba0..920ff5f 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
@@ -10,45 +10,13 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -63,20 +31,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
     {
@@ -87,7 +41,7 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
     
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -100,7 +54,12 @@ main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize () failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -108,11 +67,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
index 43a8b7e..4fa9d5a 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
@@ -1,6 +1,7 @@
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -10,47 +11,15 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 6;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +34,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -98,13 +53,12 @@ main (int argc, char **argv)
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
     }
 
   if (acc_async_test_all () != 0)
@@ -113,7 +67,12 @@ main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -121,11 +80,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
index 0726ee4..e25d894 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
@@ -5,50 +5,20 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -63,19 +33,25 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -91,11 +67,9 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -104,33 +78,30 @@ main (int argc, char **argv)
 
   acc_wait (0);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if ((((t2 > t1 ? t2 - t1 : t1 - t2) * 100.0) / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait (0);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
index 1942211..53e285f 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
@@ -6,52 +6,22 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,18 +36,25 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -93,16 +70,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,27 +84,18 @@ main (int argc, char **argv)
       acc_wait (0);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if ((((t2 > t1 ? t2 - t1 : t1 - t2) * 100.0) / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
index 11d9d62..787dcb8 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
@@ -6,52 +6,22 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,18 +36,25 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -98,16 +75,11 @@ main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -117,27 +89,19 @@ main (int argc, char **argv)
       acc_wait (i);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if ((((t2 > t1 ? t2 - t1 : t1 - t2) * 100.0) / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
index 35a0980..5ef6fd9 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-77.c
@@ -6,50 +6,20 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -64,19 +34,25 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
@@ -87,11 +63,9 @@ main (int argc, char **argv)
 
   acc_set_cuda_stream (0, stream);
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -100,33 +74,30 @@ main (int argc, char **argv)
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > t1)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   return 0;
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
index 4f58fb2..0bed15f 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
@@ -6,50 +6,20 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -64,19 +34,25 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -92,11 +68,9 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -105,33 +79,30 @@ main (int argc, char **argv)
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > (t1 + (t1 * 0.10)))
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index ef3df13..5723588 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -6,54 +6,22 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  devnum = 2;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -68,18 +36,25 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
@@ -105,16 +80,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -132,7 +102,7 @@ main (int argc, char **argv)
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
   if (acc_async_test (0) != 1)
     abort ();
@@ -140,25 +110,19 @@ main (int argc, char **argv)
   if (acc_async_test (1) != 1)
     abort ();
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  /* Compute the relative deviation (in percent) in floating point: with
+     integer operands the quotient truncates to zero for any deviation
+     smaller than T1, and abs would narrow the time_t difference to int.  */
+  if (((labs ((long) (t2 - t1)) * 100.0) / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-80.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-80.c
index d521331..ec98119 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-80.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-80.c
@@ -6,52 +6,22 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  int N;
+  const int N = 2;
   int i;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,38 +36,40 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 200.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-	  abort ();
-	}
+    {
+      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
+      abort ();
+    }
 
   acc_set_cuda_stream (1, stream);
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -109,21 +81,21 @@ main (int argc, char **argv)
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (atime < dtime)
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  t1 *= N;
+
+  /* Compute the relative deviation (in percent) in floating point: with
+     integer operands the quotient truncates to zero for any deviation
+     smaller than T1, and abs would narrow the time_t difference to int.  */
+  if (((labs ((long) (t2 - t1)) * 100.0) / t1) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   return 0;
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
index d5f18f0..77de9ba 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
@@ -6,52 +6,22 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams, stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,18 +36,25 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
+  gettimeofday (&tv1, NULL);
 
-  dtime = 500.0;
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
 
-  dticks = (unsigned long) (dtime * clkrate);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -98,11 +75,6 @@ main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (N);
   if (stream != NULL)
     abort ();
@@ -117,11 +89,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (N, stream))
     abort ();
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -129,6 +101,10 @@ main (int argc, char **argv)
 	}
     }
 
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
   acc_wait_all_async (N);
 
   for (i = 0; i <= N; i++)
@@ -145,15 +121,13 @@ main (int argc, char **argv)
 	abort ();
     }
 
-  atime = stop_timer (0);
-
-  if (atime < dtime)
+  if ((t1 * N) < t2)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   stream = (CUstream) acc_get_cuda_stream (N + 1);
   if (stream != NULL)
@@ -173,35 +147,33 @@ main (int argc, char **argv)
 
   acc_wait (N + 1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all_async (N);
 
   acc_wait (N);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 3\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
index be30a7f..ecf7488 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
@@ -10,46 +10,18 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay2;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 32;
   int i;
   CUstream *streams;
-  unsigned long **a, **d_a, *tid, ticks;
+  unsigned long **a, **d_a, *tid;
   int nbytes;
-  void *kargs[3];
-  int clkrate;
-  int devnum, nprocs;
+  void *kargs[2];
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,10 +38,6 @@ main (int argc, char **argv)
 
   nbytes = sizeof (int);
 
-  ticks = (unsigned long) (200.0 * clkrate);
-
-  N = nprocs;
-
   streams = (CUstream *) malloc (N * sizeof (void *));
 
   a = (unsigned long **) malloc (N * sizeof (unsigned long *));
@@ -103,8 +71,7 @@ main (int argc, char **argv)
   for (i = 0; i < N; i++)
     {
       kargs[0] = (void *) &d_a[i];
-      kargs[1] = (void *) &ticks;
-      kargs[2] = (void *) &tid[i];
+      kargs[1] = (void *) &tid[i];
 
       r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
       if (r != CUDA_SUCCESS)
@@ -112,8 +79,6 @@ main (int argc, char **argv)
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
-      ticks = (unsigned long) (50.0 * clkrate);
     }
 
   acc_wait_all_async (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/lib-83.c libgomp/testsuite/libgomp.oacc-c-c++-common/lib-83.c
index 1c2e52b..51b7ee7 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-83.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-83.c
@@ -5,21 +5,19 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <openacc.h>
-#include "timer.h"
+#include <cuda.h>
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  float atime;
   CUstream stream;
   CUresult r;
+  struct timeval tv1, tv2;
+  time_t t1;
 
   acc_init (acc_device_nvidia);
 
-  (void) acc_get_device_num (acc_device_nvidia);
-
-  init_timers (1);
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -34,22 +32,22 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all_async (0);
 
   acc_wait (0);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.010 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  fini_timers ();
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-1.c
index fd9df33..9a411fe 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-1.c
@@ -2,205 +2,5 @@
 
 #include <stdlib.h>
 
-int i;
-
-int main(void)
-{
-  int j, v;
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) copyin (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-#if ACC_MEM_SHARED
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#else
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-#endif
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) copyout (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) copy (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) create (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-#if ACC_MEM_SHARED
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#else
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-#endif
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) present_or_copyin (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1)
-    abort ();
-#if ACC_MEM_SHARED
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#else
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-#endif
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) present_or_copyout (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) present_or_copy (i, j)
-  {
-    if (i != -1 || j != -2)
-      abort ();
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-
-  i = -1;
-  j = -2;
-  v = 0;
-#pragma acc parallel /* copyout */ present_or_copyout (v) present_or_create (i, j)
-  {
-    i = 2;
-    j = 1;
-    if (i != 2 || j != 1)
-      abort ();
-    v = 1;
-  }
-  if (v != 1)
-    abort ();
-#if ACC_MEM_SHARED
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#else
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-#endif
-
-  i = -1;
-  j = -2;
-  v = 0;
-
-#pragma acc data copyin (i, j)
-  {
-#pragma acc parallel /* copyout */ present_or_copyout (v) present (i, j)
-    {
-      if (i != -1 || j != -2)
-        abort ();
-      i = 2;
-      j = 1;
-      if (i != 2 || j != 1)
-        abort ();
-      v = 1;
-    }
-  }
-#if ACC_MEM_SHARED
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#else
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-#endif
-
-  i = -1;
-  j = -2;
-  v = 0;
-
-#pragma acc data copyin(i, j)
-  {
-#pragma acc parallel /* copyout */ present_or_copyout (v)
-    {
-      if (i != -1 || j != -2)
-        abort ();
-      i = 2;
-      j = 1;
-      if (i != 2 || j != 1)
-        abort ();
-      v = 1;
-    }
-  }
-#if ACC_MEM_SHARED
-  if (v != 1 || i != 2 || j != 1)
-    abort ();
-#else
-  if (v != 1 || i != -1 || j != -2)
-    abort ();
-#endif
-
-  return 0;
-}
+#define EXEC_DIRECTIVE parallel
+#include "data-clauses.h"
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/routine-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/routine-1.c
new file mode 100644
index 0000000..a27d076
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/routine-1.c
@@ -0,0 +1,47 @@
+/* FIXME: remove -fno-var-tracking and -fno-exceptions from dg-options.  */
+
+/* { dg-do run } */
+/* { dg-options "-fno-inline -fno-var-tracking -fno-exceptions" } */
+
+/* Check that a recursive function marked with an unnamed OpenACC "routine"
+   directive can be called from inside a parallel region, and that the
+   results match those computed by calling it outside of that region.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#pragma acc routine
+int
+fact (int n)
+{
+  if (n == 0 || n == 1)
+    return 1;
+
+  return n * fact (n - 1);
+}
+
+int
+main (void)
+{
+  int *a, i, n = 10;
+
+  a = (int *) malloc (sizeof (int) * n);
+  if (a == NULL)
+    abort ();
+
+#pragma acc parallel copy (a[0:n]) vector_length (5)
+  {
+#pragma acc loop
+    for (i = 0; i < n; i++)
+      a[i] = fact (i);
+  }
+
+  /* Compare against the same routine called outside the parallel region.  */
+  for (i = 0; i < n; i++)
+    if (a[i] != fact (i))
+      abort ();
+
+  free (a);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/routine-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/routine-2.c
new file mode 100644
index 0000000..8ec4d8b
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/routine-2.c
@@ -0,0 +1,52 @@
+/* FIXME: remove -fno-var-tracking and -fno-exceptions from dg-options.  */
+
+/* { dg-do run } */
+/* { dg-options "-fno-inline -fno-var-tracking -fno-exceptions" } */
+
+/* Check that a recursive function named in an OpenACC "routine" directive
+   can be called from inside a parallel region, and that the results match
+   those computed by calling it outside of that region.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* The named form of the directive applies to the function of that name in
+   scope, so declare it before the directive.  */
+int fact (int n);
+
+#pragma acc routine (fact)
+
+int
+fact (int n)
+{
+  if (n == 0 || n == 1)
+    return 1;
+
+  return n * fact (n - 1);
+}
+
+int
+main (void)
+{
+  int *a, i, n = 10;
+
+  a = (int *) malloc (sizeof (int) * n);
+  if (a == NULL)
+    abort ();
+
+#pragma acc parallel copy (a[0:n]) vector_length (5)
+  {
+#pragma acc loop
+    for (i = 0; i < n; i++)
+      a[i] = fact (i);
+  }
+
+  /* Compare against the same routine called outside the parallel region.  */
+  for (i = 0; i < n; i++)
+    if (a[i] != fact (i))
+      abort ();
+
+  free (a);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
index 9db236c..0c9096f 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
@@ -1,46 +1,30 @@
 
-#if ACC_DEVICE_TYPE_nvidia
-
 #pragma acc routine nohost
-static int clock (void)
-{
-  int thetime;
-
-  asm __volatile__ ("mov.u32 %0, %%clock;" : "=r"(thetime));
-
-  return thetime;
-}
-
-#endif
-
 void
-delay (unsigned long *d_o, unsigned long delay)
+delay (void)
 {
-  int start, ticks;
+  /* Busy-loop that exists only to consume time.  SUM is initialized so
+     that no indeterminate value is read, and volatile so that the loop
+     is not optimized away.  */
+  int i;
+  volatile int sum = 0;
+  const int N = 500000;
 
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
-
-  return;
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
 }
 
+#pragma acc routine nohost
 void
-delay2 (unsigned long *d_o, unsigned long delay, unsigned long tid)
+delay2 (unsigned long *d_o, unsigned long tid)
 {
-  int start, ticks;
-
-  start = clock ();
-
-  ticks = 0;
+  /* Same busy-loop as in delay; afterwards TID is stored to D_O[0].  */
+  int i;
+  volatile int sum = 0;
+  const int N = 500000;
 
-  while (ticks < delay)
-    ticks = clock () - start;
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
 
   d_o[0] = tid;
-
-  return;
 }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
index 6f748fc..88b63bf 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
@@ -1,148 +1,90 @@
-// BEGIN PREAMBLE
-	.version	3.1
-	.target	sm_30
+	.version 3.1
+	.target sm_30
 	.address_size 64
-// END PREAMBLE
 
-// BEGIN FUNCTION DEF: clock
-.func (.param.u32 %out_retval)clock
-{
-.reg.u32 %retval;
-	.reg.u64 %hr10;
-	.reg.u32 %r22;
-	.reg.u32 %r23;
-	.reg.u32 %r24;
-	.local.align 8 .b8 %frame[8];
-	// #APP 
-// 7 "subr.c" 1
-	mov.u32 %r24, %clock;
-// 0 "" 2
-	// #NO_APP 
-		st.local.u32	[%frame], %r24;
-		ld.local.u32	%r22, [%frame];
-		mov.u32	%r23, %r22;
-		mov.u32	%retval, %r23;
-	st.param.u32	[%out_retval], %retval;
-	ret;
-	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay
-.visible .entry delay(.param.u64 %in_ar1, .param.u64 %in_ar2)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u32 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u64 %r31;
-	.reg.pred %r32;
-	.local.align 8 .b8 %frame[24];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-		mov.u64	%r24, %ar1;
-		st.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
+	.visible .entry delay
 	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r26, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r26;
-		mov.u32	%r27, 0;
-		st.local.u32	[%frame], %r27;
-		bra	$L4;
-$L5:
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r28, [%retval_in];
-}
-		mov.u32	%r23, %r28;
-		ld.local.u32	%r30, [%frame+4];
-		sub.u32	%r29, %r23, %r30;
-		st.local.u32	[%frame], %r29;
-$L4:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r31, [%frame+16];
-		setp.lo.u64 %r32,%r22,%r31;
-	@%r32	bra	$L5;
+	.reg .u64 %hr10;
+	.reg .u32 %r22;
+	.reg .u32 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .pred %r30;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[16];
+	cvta.local.u64 %frame,%farray;
+	mov.u32 %r22,500000;
+	st.u32 [%frame+8],%r22;
+	mov.u32 %r23,0;
+	st.u32 [%frame],%r23;
+	bra $L2;
+	$L3:
+	ld.u32 %r25,[%frame+4];
+	add.u32 %r24,%r25,1;
+	st.u32 [%frame+4],%r24;
+	ld.u32 %r27,[%frame];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame],%r26;
+	$L2:
+	ld.u32 %r28,[%frame];
+	ld.u32 %r29,[%frame+8];
+	setp.lt.s32 %r30,%r28,%r29;
+	@%r30 
+	bra $L3;
 	ret;
 	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay2
-.visible .entry delay2(.param.u64 %in_ar1, .param.u64 %in_ar2, .param.u64 %in_ar3)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %ar3;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u64 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u32 %r31;
-	.reg.u64 %r32;
-	.reg.pred %r33;
-	.reg.u64 %r34;
-	.reg.u64 %r35;
-	.local.align 8 .b8 %frame[32];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-	ld.param.u64 %ar3, [%in_ar3];
-		mov.u64	%r24, %ar1;
-		st.local.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
-		mov.u64	%r26, %ar3;
-		st.local.u64	[%frame+24], %r26;
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r27, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r27;
-		mov.u32	%r28, 0;
-		st.local.u32	[%frame], %r28;
-		bra	$L8;
-$L9:
-	{
-		.param.u32 %retval_in;
+
+	.visible .entry delay2 (.param .u64 %in_ar1, .param .u64 %in_ar2)
 	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r29, [%retval_in];
-}
-		mov.u32	%r23, %r29;
-		ld.local.u32	%r31, [%frame+4];
-		sub.u32	%r30, %r23, %r31;
-		st.local.u32	[%frame], %r30;
-$L8:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r32, [%frame+16];
-		setp.lo.u64 %r33,%r22,%r32;
-	@%r33	bra	$L9;
-		ld.local.u64	%r34, [%frame+8];
-		ld.local.u64	%r35, [%frame+24];
-		st.u64	[%r34], %r35;
+	.reg .u64 %ar1;
+	.reg .u64 %ar2;
+	.reg .u64 %hr10;
+	.reg .u64 %r22;
+	.reg .u64 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .u32 %r30;
+	.reg .u32 %r31;
+	.reg .pred %r32;
+	.reg .u64 %r33;
+	.reg .u64 %r34;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[32];
+	cvta.local.u64 %frame,%farray;
+	ld.param.u64 %ar1,[%in_ar1];
+	ld.param.u64 %ar2,[%in_ar2];
+	mov.u64 %r22,%ar1;
+	st.u64 [%frame+16],%r22;
+	mov.u64 %r23,%ar2;
+	st.u64 [%frame+24],%r23;
+	mov.u32 %r24,500000;
+	st.u32 [%frame+8],%r24;
+	mov.u32 %r25,0;
+	st.u32 [%frame],%r25;
+	bra $L5;
+	$L6:
+	ld.u32 %r27,[%frame+4];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame+4],%r26;
+	ld.u32 %r29,[%frame];
+	add.u32 %r28,%r29,1;
+	st.u32 [%frame],%r28;
+	$L5:
+	ld.u32 %r30,[%frame];
+	ld.u32 %r31,[%frame+8];
+	setp.lt.s32 %r32,%r30,%r31;
+	@%r32 
+	bra $L6;
+	ld.u64 %r33,[%frame+16];
+	ld.u64 %r34,[%frame+24];
+	st.u64 [%r33],%r34;
 	ret;
 	}
-// END FUNCTION DEF
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
deleted file mode 100644
index 53749da..0000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
+++ /dev/null
@@ -1,103 +0,0 @@
-
-#include <stdio.h>
-#include <cuda.h>
-
-static int _Tnum_timers;
-static CUevent *_Tstart_events, *_Tstop_events;
-static CUstream _Tstream;
-
-void
-init_timers (int ntimers)
-{
-  int i;
-  CUresult r;
-
-  _Tnum_timers = ntimers;
-
-  _Tstart_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-  _Tstop_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-
-  r = cuStreamCreate (&_Tstream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-      abort ();
-    }
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      r = cuEventCreate (&_Tstart_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-
-      r = cuEventCreate (&_Tstop_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-    }
-}
-
-void
-fini_timers (void)
-{
-  int i;
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      cuEventDestroy (_Tstart_events[i]);
-      cuEventDestroy (_Tstop_events[i]);
-    }
-
-  cuStreamDestroy (_Tstream);
-
-  free (_Tstart_events);
-  free (_Tstop_events);
-}
-
-void
-start_timer (int timer)
-{
-  CUresult r;
-
-  r = cuEventRecord (_Tstart_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-}
-
-float
-stop_timer (int timer)
-{
-  CUresult r;
-  float etime;
-
-  r = cuEventRecord (_Tstop_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventSynchronize (_Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventSynchronize failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventElapsedTime (&etime, _Tstart_events[timer], _Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventElapsedTime failed: %d\n", r);
-      abort ();
-    }
-
-  return etime;
-}
diff --git libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90 libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
new file mode 100644
index 0000000..27c5c9e
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
@@ -0,0 +1,784 @@
+! { dg-do run }
+! Checks '!$acc atomic capture' (capture-then-update and update-then-capture) on real, logical and integer scalars.
+program main
+  implicit none
+  integer igot, iexp, itmp, iexpr, i  ! *got: atomically updated, *exp: expected result, *tmp: captured value
+  real fgot, fexp, ftmp
+  logical lgot, lexp, ltmp
+  integer, parameter :: N = 32        ! iteration count; also the bit width assumed by the bit tests
+  igot = 0  ! ---- Part 1: capture the old value, then write/update (v = x; x = ...)
+  iexp = N * 2  ! value written by the iteration with i = N
+
+  !$acc parallel copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      itmp = igot
+      igot = i + i
+  !$acc end atomic
+    end do
+  !$acc end parallel
+
+  if (igot /= iexp) call abort
+  if (itmp /= iexp - 2) call abort  ! NOTE(review): expects the capture made by the iteration i = N, i.e. in-order execution
+
+  fgot = 1234.0
+  fexp = 1266.0  ! 1234.0 incremented N times
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = fgot + 1.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp - 1.0) call abort  ! captured value is the one before the last increment
+  if (fgot /= fexp) call abort
+
+  fgot = 1.0
+  fexp = 2.0**32  ! 1.0 doubled N times
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = fgot * 2.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp / 2.0) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 32.0
+  fexp = fgot - N  ! decremented N times
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = fgot - 1.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp + 1.0) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 2.0**32
+  fexp = 1.0  ! 2.0**32 halved N times
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = fgot / 2.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp * 2.0) call abort  ! compare against the expected value, like every other stanza
+  if (fgot /= fexp) call abort
+
+  lgot = .TRUE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = lgot .and. .FALSE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. .not. lexp) call abort  ! captured value is the initial .TRUE.
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = lgot .or. .FALSE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = lgot .eqv. .TRUE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .TRUE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = lgot .neqv. .TRUE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. .not. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  fgot = 1234.0  ! ---- same operators with the operands swapped (x = expr op x)
+  fexp = 1266.0
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = 1.0 + fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp - 1.0) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 1.0
+  fexp = 2.0**32
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = 2.0 * fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp / 2.0) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 32.0
+  fexp = 32.0  ! x = 2.0 - x applied an even number of times restores x
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = 2.0 - fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= 2.0 - fexp) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 2.0**16
+  fexp = 2.0**16  ! x = 2.0 / x applied an even number of times restores x
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      ftmp = fgot
+      fgot = 2.0 / fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= 2.0 / fexp) call abort
+  if (fgot /= fexp) call abort
+
+  lgot = .TRUE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = .FALSE. .and. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. .not. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = .FALSE. .or. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = .TRUE. .eqv. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .TRUE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    ltmp = lgot
+    lgot = .TRUE. .neqv. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. .not. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  igot = 1  ! ---- intrinsic-procedure updates, x as first argument
+  iexp = N
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      itmp = igot
+      igot = max (igot, i)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp - 1) call abort
+  if (igot /= iexp) call abort
+
+  igot = N
+  iexp = 1
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      itmp = igot
+      igot = min (igot, i)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0  ! every bit position 0 .. N-1 gets cleared
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = ibclr (-2, i)  ! mask with bit 0 and bit i cleared
+  !$acc atomic capture
+      itmp = igot
+      igot = iand (igot, iexpr)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= ibset (iexp, N - 1)) call abort  ! before the last step only bit N-1 is still set
+  if (igot /= iexp) call abort
+
+  igot = 0
+  iexp = -1  ! every bit position 0 .. N-1 gets set
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)  ! single bit i
+  !$acc atomic capture
+      itmp = igot
+      igot = ior (igot, iexpr)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= ieor (iexp, lshift (1, N - 1))) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0  ! every bit position 0 .. N-1 gets toggled once
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic capture
+      itmp = igot
+      igot = ieor (igot, iexpr)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= ior (iexp, lshift (1, N - 1))) call abort
+  if (igot /= iexp) call abort
+
+  igot = 1  ! ---- intrinsic-procedure updates, x as second argument
+  iexp = N
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      itmp = igot
+      igot = max (i, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp - 1) call abort
+  if (igot /= iexp) call abort
+
+  igot = N
+  iexp = 1
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      itmp = igot
+      igot = min (i, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = ibclr (-2, i)
+  !$acc atomic capture
+      itmp = igot
+      igot = iand (iexpr, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= ibset (iexp, N - 1)) call abort
+  if (igot /= iexp) call abort
+
+  igot = 0
+  iexp = -1
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic capture
+      itmp = igot
+      igot = ior (iexpr, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= ieor (iexp, lshift (1, N - 1))) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic capture
+      itmp = igot
+      igot = ieor (iexpr, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= ior (iexp, lshift (1, N - 1))) call abort
+  if (igot /= iexp) call abort
+
+  fgot = 1234.0  ! ---- Part 2: update first, then capture the new value (x = ...; v = x)
+  fexp = 1266.0
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = fgot + 1.0
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort  ! captured value now equals the final value
+  if (fgot /= fexp) call abort
+
+  fgot = 1.0
+  fexp = 2.0**32
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = fgot * 2.0
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 32.0
+  fexp = fgot - N
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = fgot - 1.0
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 2.0**32
+  fexp = 1.0
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = fgot / 2.0
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort
+  if (fgot /= fexp) call abort
+
+  lgot = .TRUE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = lgot .and. .FALSE.
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = lgot .or. .FALSE.
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = lgot .eqv. .TRUE.
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .TRUE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = lgot .neqv. .TRUE.
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  fgot = 1234.0  ! ---- update-then-capture with the operands swapped
+  fexp = 1266.0
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = 1.0 + fgot
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 1.0
+  fexp = 2.0**32
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = 2.0 * fgot
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 32.0
+  fexp = 32.0
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = 2.0 - fgot
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort
+  if (fgot /= fexp) call abort
+
+  fgot = 2.0**16
+  fexp = 2.0**16
+
+  !$acc parallel loop copy (fgot, ftmp)
+    do i = 1, N
+  !$acc atomic capture
+      fgot = 2.0 / fgot
+      ftmp = fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (ftmp /= fexp) call abort
+  if (fgot /= fexp) call abort
+
+  lgot = .TRUE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = .FALSE. .and. lgot
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = .FALSE. .or. lgot
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = .TRUE. .eqv. lgot
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .TRUE.
+
+  !$acc parallel copy (lgot, ltmp)
+  !$acc atomic capture
+    lgot = .TRUE. .neqv. lgot
+    ltmp = lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (ltmp .neqv. lexp) call abort
+  if (lgot .neqv. lexp) call abort
+
+  igot = 1  ! ---- update-then-capture with intrinsic procedures, x as first argument
+  iexp = N
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      igot = max (igot, i)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = N
+  iexp = 1
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      igot = min (igot, i)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = ibclr (-2, i)
+  !$acc atomic capture
+      igot = iand (igot, iexpr)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = 0
+  iexp = -1
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic capture
+      igot = ior (igot, iexpr)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic capture
+      igot = ieor (igot, iexpr)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = 1  ! ---- update-then-capture with intrinsic procedures, x as second argument
+  iexp = N
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      igot = max (i, igot)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = N
+  iexp = 1
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 1, N
+  !$acc atomic capture
+      igot = min (i, igot)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = ibclr (-2, i)
+  !$acc atomic capture
+      igot = iand (iexpr, igot)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = 0
+  iexp = -1
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic capture
+      igot = ior (iexpr, igot)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot, itmp)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic capture
+      igot = ieor (iexpr, igot)
+      itmp = igot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (itmp /= iexp) call abort
+  if (igot /= iexp) call abort
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/atomic_update-1.f90 libgomp/testsuite/libgomp.oacc-fortran/atomic_update-1.f90
new file mode 100644
index 0000000..6607c77
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/atomic_update-1.f90
@@ -0,0 +1,338 @@
+! { dg-do run }
+! Checks '!$acc atomic update' on real, logical and integer scalars, with the updated variable on either operand side.
+program main
+  implicit none
+  integer igot, iexp, iexpr, i  ! *got: atomically updated, *exp: expected result, iexpr: per-iteration bit mask
+  real fgot, fexp
+  integer, parameter :: N = 32  ! iteration count; also the bit width assumed by the bit tests
+  logical lgot, lexp
+
+  fgot = 1234.0  ! ---- operator form x = x op expr
+  fexp = 1266.0  ! 1234.0 incremented N times
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = fgot + 1.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  fgot = 1.0
+  fexp = 2.0**32  ! 1.0 doubled N times
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = fgot * 2.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  fgot = 32.0
+  fexp = fgot - N  ! decremented N times
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = fgot - 1.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  fgot = 2.0**32
+  fexp = 1.0  ! 2.0**32 halved N times
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = fgot / 2.0
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  lgot = .TRUE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = lgot .and. .FALSE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = lgot .or. .FALSE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = lgot .eqv. .TRUE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .TRUE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = lgot .neqv. .TRUE.
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  fgot = 1234.0  ! ---- operator form with the operands swapped (x = expr op x)
+  fexp = 1266.0
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = 1.0 + fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  fgot = 1.0
+  fexp = 2.0**32
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = 2.0 * fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  fgot = 32.0
+  fexp = 32.0  ! x = 2.0 - x applied an even number of times restores x
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = 2.0 - fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  fgot = 2.0**16
+  fexp = 2.0**16  ! x = 2.0 / x applied an even number of times restores x
+
+  !$acc parallel loop copy (fgot)
+    do i = 1, N
+  !$acc atomic update
+      fgot = 2.0 / fgot
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (fgot /= fexp) call abort
+
+  lgot = .TRUE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = .FALSE. .and. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = .FALSE. .or. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .FALSE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = .TRUE. .eqv. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  lgot = .FALSE.
+  lexp = .TRUE.
+
+  !$acc parallel copy (lgot)
+  !$acc atomic update
+    lgot = .TRUE. .neqv. lgot
+  !$acc end atomic
+  !$acc end parallel
+
+  if (lgot .neqv. lexp) call abort
+
+  igot = 1  ! ---- intrinsic-procedure form, x as first argument
+  iexp = N
+
+  !$acc parallel loop copy (igot)
+    do i = 1, N
+  !$acc atomic update
+      igot = max (igot, i)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = N
+  iexp = 1
+
+  !$acc parallel loop copy (igot)
+    do i = 1, N
+  !$acc atomic update
+      igot = min (igot, i)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0  ! every bit position 0 .. N-1 gets cleared
+
+  !$acc parallel loop copy (igot)
+    do i = 0, N - 1
+      iexpr = ibclr (-2, i)  ! mask with bit 0 and bit i cleared
+  !$acc atomic update
+      igot = iand (igot, iexpr)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = 0
+  iexp = -1  ! every bit position 0 .. N-1 gets set
+
+  !$acc parallel loop copy (igot)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)  ! single bit i
+  !$acc atomic update
+      igot = ior (igot, iexpr)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0  ! every bit position 0 .. N-1 gets toggled once
+
+  !$acc parallel loop copy (igot)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic update
+      igot = ieor (igot, iexpr)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = 1  ! ---- intrinsic-procedure form, x as second argument
+  iexp = N
+
+  !$acc parallel loop copy (igot)
+    do i = 1, N
+  !$acc atomic update
+      igot = max (i, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = N
+  iexp = 1
+
+  !$acc parallel loop copy (igot)
+    do i = 1, N
+  !$acc atomic update
+      igot = min (i, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot)
+    do i = 0, N - 1
+      iexpr = ibclr (-2, i)
+  !$acc atomic update
+      igot = iand (iexpr, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = 0
+  iexp = -1
+
+  !$acc parallel loop copy (igot)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic update
+      igot = ior (iexpr, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+  igot = -1
+  iexp = 0
+
+  !$acc parallel loop copy (igot)
+    do i = 0, N - 1
+      iexpr = lshift (1, i)
+  !$acc atomic update
+      igot = ieor (iexpr, igot)
+  !$acc end atomic
+    end do
+  !$acc end parallel loop
+
+  if (igot /= iexp) call abort
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/cache-1.f90 libgomp/testsuite/libgomp.oacc-fortran/cache-1.f90
new file mode 100644
index 0000000..f01b8e9
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/cache-1.f90
@@ -0,0 +1,26 @@
+! Applies '!$acc cache' with several array-section forms inside a parallel region that copies A into B.
+program main
+    integer, parameter :: N = 8
+    integer, dimension (N) :: a, b
+    integer :: i
+    integer :: idx, len  ! run-time bounds used by the non-constant cache sections below
+
+    idx = 1
+    len = 2
+
+    !$acc parallel copyin (a(1:N)) copyout (b(1:N))
+        do i = 1, N
+    ! Forms covered: full section, sections and elements starting at 0 (below the declared bound 1), variable bounds.
+    !$acc cache (a(1:N))
+    !$acc cache (a(0:N))
+    !$acc cache (a(0:N), b(0:N))
+    !$acc cache (a(0))
+    !$acc cache (a(0), a(1), b(0:N))
+    !$acc cache (a(idx))
+    !$acc cache (a(idx:len))
+
+            b(i) = a(i)  ! NOTE(review): a is never assigned and b is never checked; presumably only the directives matter - confirm
+        end do
+    !$acc end parallel
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/clauses-1.f90 libgomp/testsuite/libgomp.oacc-fortran/clauses-1.f90
new file mode 100644
index 0000000..e6ab78d
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/clauses-1.f90
@@ -0,0 +1,290 @@
+! { dg-do run }
+! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+! Checks data clauses on '!$acc parallel' (copyin, copyout, copy, create, present, present_or_* and p* aliases).
+program main
+  use openacc
+  implicit none
+
+  integer, parameter :: N = 32
+  real, allocatable :: a(:), b(:), c(:)  ! a: input, b: output, c: device-side scratch
+  integer i
+
+  i = 0
+
+  allocate (a(N))
+  allocate (b(N))
+  allocate (c(N))
+
+  a(:) = 3.0  ! ---- copyin + copyout
+  b(:) = 0.0
+
+  !$acc parallel copyin (a(1:N)) copyout (b(1:N))
+    do i = 1, N
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (b(i) .ne. 3.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort  ! neither array may still be mapped after the region
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 5.0  ! ---- same again with fresh values
+  b(:) = 1.0
+
+  !$acc parallel copyin (a(1:N)) copyout (b(1:N))
+    do i = 1, N
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (b(i) .ne. 5.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 6.0  ! ---- present_or_copyin on an array that is already mapped
+  b(:) = 0.0
+
+  call acc_copyin (a, sizeof (a))
+
+  a(:) = 9.0  ! host-only change after the copyin
+
+  !$acc parallel present_or_copyin (a(1:N)) copyout (b(1:N))
+    do i = 1, N
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (b(i) .ne. 6.0) call abort  ! expects the value copied in earlier (6.0), not the later host value (9.0)
+  end do
+
+  call acc_copyout (a, sizeof (a))
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 6.0  ! ---- present_or_copyout on an array that is not mapped
+  b(:) = 0.0
+
+  !$acc parallel copyin (a(1:N)) present_or_copyout (b(1:N))
+     do i = 1, N
+       b(i) = a(i)
+     end do
+  !$acc end parallel
+
+  do i = 1, N
+     if (b(i) .ne. 6.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 5.0  ! ---- present_or_copyout on an array that is already mapped
+  b(:) = 2.0
+
+  call acc_copyin (b, sizeof (b))
+
+  !$acc parallel copyin (a(1:N)) present_or_copyout (b(1:N))
+     do i = 1, N
+       b(i) = a(i)
+     end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 5.0) call abort
+    if (b(i) .ne. 2.0) call abort  ! expects the host copy of b to be unchanged at this point
+  end do
+
+  call acc_copyout (b, sizeof (b))
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 3.0;  ! ---- copy + copyout
+  b(:) = 4.0;
+
+  !$acc parallel copy (a(1:N)) copyout (b(1:N))
+    do i = 1, N
+      a(i) = a(i) + 1
+      b(i) = a(i) + 2
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 4.0) call abort
+    if (b(i) .ne. 6.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 4.0  ! ---- present_or_copy on arrays that are not mapped
+  b(:) = 7.0
+
+  !$acc parallel present_or_copy (a(1:N)) present_or_copy (b(1:N))
+    do i = 1, N
+      a(i) = a(i) + 1
+      b(i) = b(i) + 2
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 5.0) call abort
+    if (b(i) .ne. 9.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 3.0  ! ---- present_or_copy on arrays that are already mapped
+  b(:) = 7.0
+
+  call acc_copyin (a, sizeof (a))
+  call acc_copyin (b, sizeof (b))
+
+  !$acc parallel present_or_copy (a(1:N)) present_or_copy (b(1:N))
+    do i = 1, N
+      a(i) = a(i) + 1
+      b(i) = b(i) + 2
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 3.0) call abort  ! expects the host copies to be unchanged before the explicit copyout
+    if (b(i) .ne. 7.0) call abort
+  end do
+
+  call acc_copyout (a, sizeof (a))
+  call acc_copyout (b, sizeof (b))
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 3.0  ! ---- create for the scratch array c
+  b(:) = 7.0
+
+  !$acc parallel copyin (a(1:N)) create (c(1:N)) copyout (b(1:N))
+    do i = 1, N
+      c(i) = a(i)
+      b(i) = c(i)
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 3.0) call abort
+    if (b(i) .ne. 3.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+  if (acc_is_present (c) .eqv. .TRUE.) call abort
+
+  a(:) = 4.0  ! ---- present_or_create for the scratch array c
+  b(:) = 8.0
+
+  !$acc parallel copyin (a(1:N)) present_or_create (c(1:N)) copyout (b(1:N))
+    do i = 1, N
+      c(i) = a(i)
+      b(i) = c(i)
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 4.0) call abort
+    if (b(i) .ne. 4.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+  if (acc_is_present (c) .eqv. .TRUE.) call abort
+
+  a(:) = 4.0  ! ---- present, with all three arrays mapped through the runtime API
+
+  call acc_copyin (a, sizeof (a))
+  call acc_copyin (b, sizeof (b))
+  call acc_copyin (c, sizeof (c))
+
+  !$acc parallel present (a(1:N)) present (c(1:N)) present (b(1:N))
+    do i = 1, N
+      c(i) = a(i)
+      b(i) = c(i)
+    end do
+  !$acc end parallel
+
+  call acc_copyout (a, sizeof (a))
+  call acc_copyout (b, sizeof (b))
+  call acc_copyout (c, sizeof (c))
+  
+  do i = 1, N
+    if (a(i) .ne. 4.0) call abort
+    if (b(i) .ne. 4.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+  if (acc_is_present (c) .eqv. .TRUE.) call abort
+
+  a(:) = 6.0  ! ---- pcopyin: same scenario as the present_or_copyin case above
+  b(:) = 0.0
+
+  call acc_copyin (a, sizeof (a))
+
+  a(:) = 9.0
+
+  !$acc parallel pcopyin (a(1:N)) copyout (b(1:N))
+    do i = 1, N
+      b(i) = a(i)
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (b(i) .ne. 6.0) call abort
+  end do
+  
+  call acc_copyout (a, sizeof (a))
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 6.0  ! ---- pcopyout on an array that is not mapped
+  b(:) = 0.0
+
+  !$acc parallel copyin (a(1:N)) pcopyout (b(1:N))
+   do i = 1, N
+     b(i) = a(i)
+   end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (b(i) .ne. 6.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+
+  a(:) = 5.0  ! ---- pcreate for the scratch array c
+  b(:) = 7.0
+
+  !$acc parallel copyin (a(1:N)) pcreate (c(1:N)) copyout (b(1:N))
+    do i = 1, N
+      c(i) = a(i)
+      b(i) = c(i)
+    end do
+  !$acc end parallel
+
+  do i = 1, N
+    if (a(i) .ne. 5.0) call abort
+    if (b(i) .ne. 5.0) call abort
+  end do
+
+  if (acc_is_present (a) .eqv. .TRUE.) call abort
+  if (acc_is_present (b) .eqv. .TRUE.) call abort
+  if (acc_is_present (c) .eqv. .TRUE.) call abort
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/data-1.f90 libgomp/testsuite/libgomp.oacc-fortran/data-1.f90
index 5e94e2d..bf323b3 100644
--- libgomp/testsuite/libgomp.oacc-fortran/data-1.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/data-1.f90
@@ -1,45 +1,212 @@
 ! { dg-do run }
+! { dg-additional-options "-cpp" }
 
-program test
-  integer, parameter :: N = 8
-  real, allocatable :: a(:), b(:)
+function is_mapped (n) result (rc)  ! rc tells whether n counts as mapped; the two build variants differ below
+  use openacc
 
-  allocate (a(N))
-  allocate (b(N))
+  integer, intent (in) :: n
+  logical rc
 
-  a(:) = 3.0
-  b(:) = 0.0
+#if ACC_MEM_SHARED
+  integer i
 
-  !$acc enter data copyin (a(1:N), b(1:N))
+  rc = .TRUE.  ! shared-memory variant: always report mapped
+  i = n  ! only reads n; presumably silences an unused-argument warning - TODO confirm
+#else
+  rc = acc_is_present (n, sizeof (n))  ! otherwise ask the OpenACC runtime about n
+#endif
 
-  !$acc parallel
-  do i = 1, n
-    b(i) = a (i)
-  end do
-  !$acc end parallel
+end function is_mapped
 
-  !$acc exit data copyout (a(1:N), b(1:N))
+program main  ! One section per data clause: reset i and j, enter a data region, check is_mapped and the host values.
+  integer i, j
+  logical is_mapped
 
-  do i = 1, n
-    if (a(i) .ne. 3.0) call abort
-    if (b(i) .ne. 3.0) call abort
-  end do
+  i = -1
+  j = -2
 
-  a(:) = 5.0
-  b(:) = 1.0
+  !$acc data copyin (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
 
-  !$acc enter data copyin (a(1:N), b(1:N))
+    if (i .ne. -1 .or. j .ne. -2) call abort
 
-  !$acc parallel
-  do i = 1, n
-    b(i) = a (i)
-  end do
-  !$acc end parallel
+    i = 2
+    j = 1
 
-  !$acc exit data copyout (a(1:N), b(1:N))
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
 
-  do i = 1, n
-    if (a(i) .ne. 5.0) call abort
-    if (b(i) .ne. 5.0) call abort
-  end do
-end program test
+  if (i .ne. 2 .or. j .ne. 1) call abort
+  ! Section: data copyout (i, j); a nested parallel region assigns 4 and 2, which the host must see afterwards.
+  i = -1
+  j = -2
+
+  !$acc data copyout (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+
+    !$acc parallel present (i, j)
+      i = 4
+      j = 2
+    !$acc end parallel
+  !$acc end data
+
+  if (i .ne. 4 .or. j .ne. 2) call abort
+  ! Section: data create (i, j); host assignments made inside the region must survive it.
+  i = -1
+  j = -2
+
+  !$acc data create (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+  ! Section: data present_or_copyin (i, j); same checks as the copyin section.
+  i = -1
+  j = -2
+
+  !$acc data present_or_copyin (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+  ! Section: data present_or_copyout (i, j); same checks as the copyout section, including the nested parallel region.
+  i = -1
+  j = -2
+
+  !$acc data present_or_copyout (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+
+    !$acc parallel present (i, j)
+      i = 4
+      j = 2
+    !$acc end parallel
+  !$acc end data
+
+  if (i .ne. 4 .or. j .ne. 2) call abort
+  ! Section: data present_or_copy (i, j); the host values expected after the region depend on the build variant.
+  i = -1
+  j = -2
+
+  !$acc data present_or_copy (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+#if ACC_MEM_SHARED
+  if (i .ne. 2 .or. j .ne. 1) call abort
+#else
+  if (i .ne. -1 .or. j .ne. -2) call abort
+#endif
+  ! Section: data present_or_create (i, j); only the mapping and the values assigned inside the region are checked.
+  i = -1
+  j = -2
+
+  !$acc data present_or_create (i, j)
+    if (is_mapped (i) .eqv. .FALSE.) call abort
+    if (is_mapped (j) .eqv. .FALSE.) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+  ! Section: data copyin (i, j) with a nested data present (i, j) region.
+  i = -1
+  j = -2
+
+  !$acc data copyin (i, j)
+    !$acc data present (i, j)
+      if (is_mapped (i) .eqv. .FALSE.) call abort
+      if (is_mapped (j) .eqv. .FALSE.) call abort
+
+      if (i .ne. -1 .or. j .ne. -2) call abort
+
+      i = 2
+      j = 1
+
+      if (i .ne. 2 .or. j .ne. 1) call abort
+    !$acc end data
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+  ! NOTE(review): this section is textually identical to the preceding one - confirm whether another clause was meant.
+  i = -1
+  j = -2
+
+  !$acc data copyin (i, j)
+    !$acc data present (i, j)
+      if (is_mapped (i) .eqv. .FALSE.) call abort
+      if (is_mapped (j) .eqv. .FALSE.) call abort
+
+      if (i .ne. -1 .or. j .ne. -2) call abort
+
+      i = 2
+      j = 1
+
+      if (i .ne. 2 .or. j .ne. 1) call abort
+    !$acc end data
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+  ! Section: data construct without clauses; unless memory is shared, i and j must not be reported as mapped.
+  i = -1
+  j = -2
+
+  !$acc data
+#if !ACC_MEM_SHARED
+    if (is_mapped (i) .eqv. .TRUE.) call abort
+    if (is_mapped (j) .eqv. .TRUE.) call abort
+#endif
+    if (i .ne. -1 .or. j .ne. -2) call abort
+
+    i = 2
+    j = 1
+
+    if (i .ne. 2 .or. j .ne. 1) call abort
+  !$acc end data
+
+  if (i .ne. 2 .or. j .ne. 1) call abort
+
+end program main
diff --git libgomp/testsuite/libgomp.oacc-fortran/data-2.f90 libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
index 8736c2a..d190700 100644
--- libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/data-2.f90
@@ -1,8 +1,14 @@
 ! { dg-do run }
 
 program test
+  use openacc
   integer, parameter :: N = 8
   real, allocatable :: a(:,:), b(:,:)
+  real, allocatable :: c(:), d(:)
+  integer i, j
+
+  i = 0
+  j = 0
 
   allocate (a(N,N))
   allocate (b(N,N))
@@ -28,4 +34,48 @@ program test
       if (b(j,i) .ne. 3.0) call abort
     end do
   end do
+
+  allocate (c(N))
+  allocate (d(N))
+
+  c(:) = 3.0
+  d(:) = 0.0
+  ! Case 1: map c and d with a single asynchronous enter data, then wait for it.
+  !$acc enter data copyin (c(1:N)) create (d(1:N)) async
+  !$acc wait
+  
+  !$acc parallel 
+    do i = 1, N
+      d(i) = c(i) + 1
+    end do
+  !$acc end parallel
+
+  !$acc exit data copyout (c(1:N), d(1:N)) async
+  !$acc wait
+
+  do i = 1, N
+    if (d(i) .ne. 4.0) call abort
+  end do
+  ! Case 2: map c and d with separate enter data directives (async, then wait).
+  c(:) = 3.0
+  d(:) = 0.0
+
+  !$acc enter data copyin (c(1:N)) async
+  !$acc enter data create (d(1:N)) wait
+  !$acc wait
+
+  !$acc parallel 
+    do i = 1, N
+      d(i) = c(i) + 1
+    end do
+  !$acc end parallel
+  ! exit data needs a copyout or delete clause; release c here so it does not stay mapped.
+  !$acc exit data copyout (d(1:N)) async
+  !$acc exit data delete (c(1:N)) async
+  !$acc wait
+
+  do i = 1, N
+    if (d(i) .ne. 4.0) call abort
+  end do
+
 end program test
diff --git libgomp/testsuite/libgomp.oacc-fortran/data-3.f90 libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
index 9868cb0..daf20a5 100644
--- libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
@@ -17,7 +17,7 @@ program asyncwait
 
   !$acc enter data copyin (a(1:N)) copyin (b(1:N)) copyin (N) async
 
-  !$acc parallel async wait
+  !$acc parallel async wait present (a(1:N)) present (b(1:N)) present (N)
   do i = 1, N
      b(i) = a(i)
   end do
@@ -36,7 +36,7 @@ program asyncwait
 
   !$acc enter data copyin (a(1:N)) copyin (b(1:N)) async (1)
 
-  !$acc parallel async (1) wait (1)
+  !$acc parallel async (1) wait (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = a(i)
   end do
@@ -55,28 +55,30 @@ program asyncwait
   c(:) = 0.0
   d(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N))
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i)  + a(i)) / a(i)) - a(i)
   end do
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N))
 
   do i = 1, N
      if (a(i) .ne. 3.0) call abort
@@ -91,34 +93,40 @@ program asyncwait
   d(:) = 0.0
   e(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N)) copyin (e(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N)) copyin (e(1:N))
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+  !$acc& present (e(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+  !$acc& present (e(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), c(1:N), d(1:N)) &
+  !$acc& present (e(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel wait (1) async (1)
+  !$acc parallel wait (1) async (1) present (a(1:N), b(1:N), c(1:N)) &
+  !$acc& present (d(1:N), e(1:N), N)
   do i = 1, N
      e(i) = a(i) + b(i) + c(i) + d(i)
   end do
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N)) copyout (e(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N)) copyout (e(1:N))
   !$acc exit data delete (N)
 
   do i = 1, N
diff --git libgomp/testsuite/libgomp.oacc-fortran/data-4-2.f90 libgomp/testsuite/libgomp.oacc-fortran/data-4-2.f90
index 16a8598..d1ecf0a 100644
--- libgomp/testsuite/libgomp.oacc-fortran/data-4-2.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/data-4-2.f90
@@ -19,7 +19,7 @@ program asyncwait
 
   !$acc enter data copyin (a(1:N)) copyin (b(1:N)) copyin (N) async
 
-  !$acc parallel async wait
+  !$acc parallel async wait present (a(1:N), b(1:N), N)
   !$acc loop
   do i = 1, N
      b(i) = a(i)
@@ -39,7 +39,7 @@ program asyncwait
 
   !$acc update device (a(1:N), b(1:N)) async (1)
 
-  !$acc parallel async (1) wait (1)
+  !$acc parallel async (1) wait (1) present (a(1:N), b(1:N), N)
   !$acc loop
   do i = 1, N
      b(i) = a(i)
@@ -62,19 +62,19 @@ program asyncwait
   !$acc enter data copyin (c(1:N), d(1:N)) async (1)
   !$acc update device (a(1:N), b(1:N)) async (1)
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i)  + a(i)) / a(i)) - a(i)
   end do
@@ -100,25 +100,26 @@ program asyncwait
   !$acc enter data copyin (e(1:N)) async (1)
   !$acc update device (a(1:N), b(1:N), c(1:N), d(1:N)) async (1)
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel wait (1) async (1)
+  !$acc parallel wait (1) async (1) present (a(1:N), b(1:N), c(1:N)) &
+  !$acc& present (d(1:N), e(1:N), N)
   do i = 1, N
      e(i) = a(i) + b(i) + c(i) + d(i)
   end do
diff --git libgomp/testsuite/libgomp.oacc-fortran/data-4.f90 libgomp/testsuite/libgomp.oacc-fortran/data-4.f90
index f6886b0..4e95a9c 100644
--- libgomp/testsuite/libgomp.oacc-fortran/data-4.f90
+++ libgomp/testsuite/libgomp.oacc-fortran/data-4.f90
@@ -17,7 +17,7 @@ program asyncwait
 
   !$acc enter data copyin (a(1:N)) copyin (b(1:N)) copyin (N) async
 
-  !$acc parallel async wait
+  !$acc parallel async wait present (a(1:N), b(1:N), N)
   !$acc loop
   do i = 1, N
      b(i) = a(i)
@@ -37,7 +37,7 @@ program asyncwait
 
   !$acc update device (a(1:N), b(1:N)) async (1)
 
-  !$acc parallel async (1) wait (1)
+  !$acc parallel async (1) wait (1) present (a(1:N), b(1:N), N)
   !$acc loop
   do i = 1, N
      b(i) = a(i)
@@ -60,19 +60,19 @@ program asyncwait
   !$acc enter data copyin (c(1:N), d(1:N)) async (1)
   !$acc update device (a(1:N), b(1:N)) async (1)
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i)  + a(i)) / a(i)) - a(i)
   end do
@@ -98,25 +98,26 @@ program asyncwait
   !$acc enter data copyin (e(1:N)) async (1)
   !$acc update device (a(1:N), b(1:N), c(1:N), d(1:N)) async (1)
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), b(1:N), N)
   do i = 1, N
      b(i) = (a(i) * a(i) * a(i)) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), c(1:N), N)
   do i = 1, N
      c(i) = (a(i) * 4) / a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel async (1)
+  !$acc parallel async (1) present (a(1:N), d(1:N), N)
   do i = 1, N
      d(i) = ((a(i) * a(i) + a(i)) / a(i)) - a(i)
   end do
   !$acc end parallel
 
-  !$acc parallel wait (1) async (1)
+  !$acc parallel wait (1) async (1) present (a(1:N), b(1:N), c(1:N)) &
+  !$acc& present (d(1:N), e(1:N), N)
   do i = 1, N
      e(i) = a(i) + b(i) + c(i) + d(i)
   end do
diff --git libgomp/testsuite/libgomp.oacc-fortran/declare-1.f90 libgomp/testsuite/libgomp.oacc-fortran/declare-1.f90
new file mode 100644
index 0000000..0bab5bd
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/declare-1.f90
@@ -0,0 +1,229 @@
+! { dg-do run  { target openacc_nvidia_accel_selected } }
+! subr6: a is declared deviceptr; a parallel region with copy (d) stores a(i) + a(i) into d(i).
+subroutine subr6 (a, d)
+  integer, parameter :: N = 8
+  integer :: i
+  integer :: a(N)
+  !$acc declare deviceptr (a)
+  integer :: d(N)
+
+  i = 0
+
+  !$acc parallel copy (d)
+    do i = 1, N
+      d(i) = a(i) + a(i)
+    end do
+  !$acc end parallel
+
+end subroutine
+! subr5: a, b, c, d carry the present_or_copyin, present_or_create, present_or_copyout and present_or_copy clauses.
+subroutine subr5 (a, b, c, d)
+  integer, parameter :: N = 8
+  integer :: i
+  integer :: a(N)
+  !$acc declare present_or_copyin (a)
+  integer :: b(N)
+  !$acc declare present_or_create (b)
+  integer :: c(N)
+  !$acc declare present_or_copyout (c)
+  integer :: d(N)
+  !$acc declare present_or_copy (d)
+
+  i = 0
+  ! Per element: b takes a, c takes b, and d is incremented by b.
+  !$acc parallel
+    do i = 1, N
+      b(i) = a(i)
+      c(i) = b(i)
+      d(i) = d(i) + b(i)
+    end do
+  !$acc end parallel
+
+end subroutine
+! subr4: a is declared present and b is declared copyout; a parallel region copies a into b element by element.
+subroutine subr4 (a, b)
+  integer, parameter :: N = 8
+  integer :: i
+  integer :: a(N)
+  !$acc declare present (a)
+  integer :: b(N)
+  !$acc declare copyout (b)
+
+  i = 0
+
+  !$acc parallel
+  do i = 1, N
+    b(i) = a(i)
+  end do
+  !$acc end parallel
+
+end subroutine
+! subr3: a is declared present and c is declared copyin; a parallel region sets a(i) = c(i) and then c(i) = 0.
+subroutine subr3 (a, c)
+  integer, parameter :: N = 8
+  integer :: i
+  integer :: a(N)
+  !$acc declare present (a)
+  integer :: c(N)
+  !$acc declare copyin (c)
+
+  i = 0
+
+  !$acc parallel
+  do i = 1, N
+    a(i) = c(i)
+    c(i) = 0
+  end do
+  !$acc end parallel
+
+end subroutine
+! subr2: a present, b create, c copy; a parallel region sets b(i) = a(i) and c(i) = b(i) + c(i) + 1.
+subroutine subr2 (a, b, c)
+  integer, parameter :: N = 8
+  integer :: i
+  integer :: a(N)
+  !$acc declare present (a)
+  integer :: b(N)
+  !$acc declare create (b)
+  integer :: c(N)
+  !$acc declare copy (c)
+
+  i = 0
+
+  !$acc parallel
+  do i = 1, N
+    b(i) = a(i)
+    c(i) = b(i) + c(i) + 1
+  end do
+  !$acc end parallel
+
+end subroutine
+! subr1: a is declared present; a parallel region increments every a(i). b and c are accepted but not referenced.
+subroutine subr1 (a, b, c)
+  integer, parameter :: N = 8
+  integer :: i
+  integer :: a(N)
+  !$acc declare present (a)
+  integer :: b(N)
+  integer :: c(N)
+
+  i = 0
+
+  !$acc parallel
+  do i = 1, N
+    a(i) = a(i) + 1
+  end do
+  !$acc end parallel
+
+end subroutine
+! test: aborts unless acc_is_present (a) agrees with the expected state e.
+subroutine test (a, e)
+  use openacc
+  logical :: e
+  integer, parameter :: N = 8
+  integer :: a(N)
+
+  if (acc_is_present (a) .neqv. e) call abort
+
+end subroutine
+! subr0: declares a with copy, calls subr1..subr6 in turn and after each call checks presence and host values.
+subroutine subr0 (a, b, c, d)
+  integer, parameter :: N = 8
+  integer :: a(N)
+  !$acc declare copy (a)
+  integer :: b(N)
+  integer :: c(N)
+  integer :: d(N)
+  ! i (used by the check loops below) is not declared; without implicit none it is implicitly typed.
+  call test (a, .true.)
+  call test (b, .false.)
+  call test (c, .false.)
+
+  call subr1 (a, b, c)
+
+  call test (a, .true.)
+  call test (b, .false.)
+  call test (c, .false.)
+
+  call subr2 (a, b, c)
+
+  call test (a, .true.)
+  call test (b, .false.)
+  call test (c, .false.)
+
+  do i = 1, N
+    if (c(i) .ne. 8) call abort
+  end do
+
+  call subr3 (a, c)
+
+  call test (a, .true.)
+  call test (b, .false.)
+  call test (c, .false.)
+
+  do i = 1, N
+    if (a(i) .ne. 2) call abort
+    if (c(i) .ne. 8) call abort
+  end do
+
+  call subr4 (a, b)
+
+  call test (a, .true.)
+  call test (b, .false.)
+  call test (c, .false.)
+
+  do i = 1, N
+    if (b(i) .ne. 8) call abort
+  end do
+
+  call subr5 (a, b, c, d)
+
+  call test (a, .true.)
+  call test (b, .false.)
+  call test (c, .false.)
+  call test (d, .false.)
+
+  do i = 1, N
+    if (c(i) .ne. 8) call abort
+    if (d(i) .ne. 13) call abort
+  end do
+
+  call subr6 (a, d)
+
+  call test (a, .true.)
+  call test (d, .false.)
+
+  do i = 1, N
+    if (d(i) .ne. 16) call abort
+  end do
+
+end subroutine
+! main: fills a, b, c, d with 2, 3, 4, 5, runs subr0, then checks that nothing is present and the final host values.
+program main
+  use openacc
+  integer, parameter :: N = 8
+  integer :: a(N)
+  integer :: b(N)
+  integer :: c(N)
+  integer :: d(N)
+
+  a(:) = 2
+  b(:) = 3
+  c(:) = 4
+  d(:) = 5
+
+  call subr0 (a, b, c, d)
+
+  call test (a, .false.)
+  call test (b, .false.)
+  call test (c, .false.)
+  call test (d, .false.)
+
+  do i = 1, N
+    if (a(i) .ne. 8) call abort
+    if (b(i) .ne. 8) call abort
+    if (c(i) .ne. 8) call abort
+    if (d(i) .ne. 16) call abort
+  end do
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
new file mode 100644
index 0000000..593cde6
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90
@@ -0,0 +1,24 @@
+! { dg-do run }
+! Checks acc_wait_async: queue 1 is made to wait on queue 0; after synchronizing, both queues must test complete.
+program main
+  use openacc
+  implicit none
+
+  integer :: i
+  integer, parameter :: n = 1000000
+
+  !$acc parallel async (0)
+    do i = 1, n
+    end do
+  !$acc end parallel
+
+  ! Make queue 1 depend on everything enqueued on queue 0 so far.
+  call acc_wait_async (0, 1)
+
+  ! Synchronize first: acc_async_test only reports completion, so probing without a wait would be a race.
+  call acc_wait (1)
+
+  if (acc_async_test (0) .neqv. .TRUE.) call abort
+  if (acc_async_test (1) .neqv. .TRUE.) call abort
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90
new file mode 100644
index 0000000..cffda87
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-13.f90
@@ -0,0 +1,28 @@
+! { dg-do run }
+! Checks acc_wait_all_async and acc_wait_all: launches one async region per queue, then verifies completion.
+program main
+  use openacc
+  implicit none
+
+  integer :: i, j, nprocs
+  integer, parameter :: N = 1000000
+
+  nprocs = 2
+
+  do j = 1, nprocs
+    !$acc parallel async (j)
+      do i = 1, N
+      end do
+    !$acc end parallel
+  end do
+
+  ! Queue nprocs + 1 is made to wait on all other queues.
+  call acc_wait_all_async (nprocs + 1)
+  ! Synchronize before probing: acc_async_test only reports completion, it does not wait for it.
+  call acc_wait_all ()
+
+  if (acc_async_test (1) .neqv. .TRUE.) call abort
+  if (acc_async_test (2) .neqv. .TRUE.) call abort
+  if (acc_async_test (nprocs + 1) .neqv. .TRUE.) call abort
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90
new file mode 100644
index 0000000..72a2b49
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-14.f90
@@ -0,0 +1,79 @@
+! { dg-do run }
+! Exercises the acc_* data routines on h, checking acc_is_present and the host values after each step.
+program main
+  use openacc
+  implicit none
+
+  integer, parameter :: N = 256
+  integer, allocatable :: h(:)
+  integer :: i
+
+  allocate (h(N))
+
+  do i = 1, N
+    h(i) = i
+  end do
+  ! acc_present_or_copyin followed by acc_copyout: h must come back unchanged.
+  call acc_present_or_copyin (h)
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_copyout (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+  do i = 1, N
+    if (h(i) /= i) call abort
+  end do
+
+  do i = 1, N
+    h(i) = i + i
+  end do
+  ! Same round trip through acc_pcopyin, this time passing an explicit size.
+  call acc_pcopyin (h, sizeof (h))
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_copyout (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+  do i = 1, N
+    if (h(i) /= i + i) call abort
+  end do
+  ! acc_create maps h; the parallel loop then writes 1..N, which acc_copyout must bring back.
+  call acc_create (h)
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+  ! The end directive must use the !$acc sentinel; any other spelling is just an ordinary comment.
+  !$acc parallel loop
+    do i = 1, N
+      h(i) = i
+    end do
+  !$acc end parallel loop
+
+  call acc_copyout (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+  do i = 1, N
+    if (h(i) /= i) call abort
+  end do
+  ! acc_present_or_create (explicit size) paired with acc_delete.
+  call acc_present_or_create (h, sizeof (h))
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_delete (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+  ! acc_pcreate paired with acc_delete.
+  call acc_pcreate (h)
+
+  if (acc_is_present (h) .neqv. .TRUE.) call abort
+
+  call acc_delete (h)
+
+  if (acc_is_present (h) .neqv. .FALSE.) call abort
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90 libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90
new file mode 100644
index 0000000..3a834db
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/lib-15.f90
@@ -0,0 +1,52 @@
+! { dg-do run }
+! { dg-skip-if "" { *-*-* } { "*" } { "-DACC_MEM_SHARED=0" } }
+! Checks acc_update_device and acc_update_self: the host copy of h is
+! changed or cleared between calls, and the values seen afterwards must
+! be the ones that were transferred.
+program main
+  use openacc
+  implicit none
+
+  integer, parameter :: N = 256
+  integer, allocatable :: h(:)
+  integer :: k
+
+  allocate (h(N))
+
+  ! Map h while it holds 1..N.
+  h(:) = [(k, k = 1, N)]
+
+  call acc_copyin (h)
+
+  ! Overwrite the host copy with 2, 4, ..., 2N and push it to the device.
+  h(:) = [(k + k, k = 1, N)]
+
+  call acc_update_device (h, sizeof (h))
+
+  if (.not. acc_is_present (h)) call abort
+
+  ! Clear the host copy; the copy-out has to bring the doubled values back.
+  h(:) = 0
+
+  call acc_copyout (h, sizeof (h))
+
+  if (any (h /= [(k + k, k = 1, N)])) call abort
+
+  ! Map the doubled values again, clear the host copy, and refresh it
+  ! from the device with acc_update_self.
+  call acc_copyin (h, sizeof (h))
+
+  h(:) = 0
+
+  call acc_update_self (h, sizeof (h))
+
+  if (.not. acc_is_present (h)) call abort
+
+  if (any (h /= [(k + k, k = 1, N)])) call abort
+
+  ! Unmap h; it must no longer be reported as present.
+  call acc_delete (h)
+
+  if (acc_is_present (h)) call abort
+
+end program
diff --git libgomp/testsuite/libgomp.oacc-fortran/routine-5.f90 libgomp/testsuite/libgomp.oacc-fortran/routine-5.f90
new file mode 100644
index 0000000..aaeb994
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-fortran/routine-5.f90
@@ -0,0 +1,27 @@
+! { dg-do run }
+! { dg-options "-fno-inline" }
+! Calls an internal function marked as an OpenACC routine from a parallel region and checks the result on the host.
+program main
+    integer :: n
+
+    n = 5
+
+    !$acc parallel copy (n)
+      n = func (n)
+    !$acc end parallel
+
+    if (n .ne. 6) call abort
+
+contains
+
+    ! func: returns n + 1. One parallelism level only (seq), and no nohost: the host may run this region too.
+    function func (n) result (rc)
+    !$acc routine seq
+    integer, intent (in) :: n
+    integer :: rc
+
+    rc = n + 1
+
+    end function
+
+end program


Grüße,
 Thomas
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 472 bytes
Desc: not available
URL: <http://gcc.gnu.org/pipermail/fortran/attachments/20150505/c44bcbe3/attachment.sig>


More information about the Fortran mailing list