This is the mail archive of the
gcc-patches@gcc.gnu.org
mailing list for the GCC project.
Re: [gomp4] Extend libgomp's fortran test coverage of host_data
- From: Cesar Philippidis <cesar at codesourcery dot com>
- To: Fortran List <fortran at gcc dot gnu dot org>, "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Fri, 12 Aug 2016 12:39:40 -0700
- Subject: Re: [gomp4] Extend libgomp's fortran test coverage of host_data
- Authentication-results: sourceware.org; auth=none
- References: <5113e7e2-25c8-e31a-73f3-9c9b2c6edc79@codesourcery.com>
On 08/11/2016 03:38 PM, Cesar Philippidis wrote:
> This patch ports libgomp.oacc-c-c++-common/host_data-1.c to fortran.
> Fortunately, the existing fortran host_data infrastructure was already
> in place, so I had to do was port over the calls to Nvidia's CUDA BLAS
> library.
>
> There are a couple of details that one needs to consider when using CUDA
> BLAS in gfortran. First, if you want to use Nvidia's wrapper functions
> written in C to set up the appropriate cuda device contexts, then use
> the thunking variants of the functions described here
> <http://docs.nvidia.com/cuda/cublas/#appendix-b-cublas-fortran-bindings>.
> Otherwise, it's much easier to let gfortran's OpenACC runtime manage the
> data mappings and use the host_data clause to pass those data pointers
> to the CUDA BLAS library calls.
>
> In terms of calling the actual CUDA BLAS functions, there's already good
> documentation for that here
> <https://gcc.gnu.org/onlinedocs/gfortran/Interoperability-with-C.html>.
> Basically, those library calls need a function interface with a special
> C binding. The function I tested in host_data-2.f90 is cublasSaxpy.
> Other function interfaces will need to be created as necessary.
>
> I've applied this patch to gomp-4_0-branch.
I've added some additional test coverage in this patch. Specifically,
I've included both a module and fixed-mode test. I also corrected some
problems when host_data-2.f90 and host_data-1.c are built with -Wall
-Wextra.
This patch has been applied to gomp-4_0-branch.
Cesar
2016-08-12 Cesar Philippidis <cesar@codesourcery.com>
libgomp/
* testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Increase test
coverage. Build with -Wall -Wextra.
* testsuite/libgomp.oacc-fortran/host_data-2.f90: Likewise.
* testsuite/libgomp.oacc-fortran/cublas-fixed.h: New test.
* testsuite/libgomp.oacc-fortran/host_data-3.f: New test.
* testsuite/libgomp.oacc-fortran/host_data-4.f90: New test.
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
index d19aa20..fe843ec 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
@@ -1,14 +1,16 @@
/* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* { dg-additional-options "-lcuda -lcublas -lcudart" } */
+/* { dg-additional-options "-lm -lcuda -lcublas -lcudart -Wall -Wextra" } */
#include <stdlib.h>
+#include <math.h>
#include <openacc.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
+#pragma acc routine
void
-saxpy_host (int n, float a, float *x, float *y)
+saxpy (int n, float a, float *x, float *y)
{
int i;
@@ -16,18 +18,18 @@ saxpy_host (int n, float a, float *x, float *y)
y[i] = y[i] + a * x[i];
}
-#pragma acc routine
void
-saxpy_target (int n, float a, float *x, float *y)
+validate_results (int n, float *a, float *b)
{
int i;
for (i = 0; i < n; i++)
- y[i] = y[i] + a * x[i];
+ if (fabs (a[i] - b[i]) > .00001)
+ abort ();
}
int
-main(int argc, char **argv)
+main()
{
/* We're using cuBLAS, so excpect to be using a Nvidia GPU. */
acc_init (acc_device_nvidia);
@@ -45,7 +47,7 @@ main(int argc, char **argv)
y[i] = y_ref[i] = 3.0;
}
- saxpy_host (N, a, x_ref, y_ref);
+ saxpy (N, a, x_ref, y_ref);
cublasCreate (&h);
@@ -57,11 +59,7 @@ main(int argc, char **argv)
}
}
- for (i = 0; i < N; i++)
- {
- if (y[i] != y_ref[i])
- abort ();
- }
+ validate_results (N, y, y_ref);
#pragma acc data create (x[0:N]) copyout (y[0:N])
{
@@ -77,11 +75,7 @@ main(int argc, char **argv)
cublasDestroy (h);
- for (i = 0; i < N; i++)
- {
- if (y[i] != y_ref[i])
- abort ();
- }
+ validate_results (N, y, y_ref);
for (i = 0; i < N; i++)
y[i] = 3.0;
@@ -90,14 +84,24 @@ main(int argc, char **argv)
#pragma acc data copyin (x[0:N]) copyin (a) copy (y[0:N])
{
#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
- saxpy_target (N, a, x, y);
+ saxpy (N, a, x, y);
}
+ validate_results (N, y, y_ref);
+
+ /* Exercise host_data with data transferred with acc enter data. */
+
for (i = 0; i < N; i++)
- {
- if (y[i] != y_ref[i])
- abort ();
- }
+ y[i] = 3.0;
+
+#pragma acc enter data copyin (x, a, y)
+#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
+ {
+ saxpy (N, a, x, y);
+ }
+#pragma acc exit data delete (x, a) copyout (y)
+
+ validate_results (N, y, y_ref);
return 0;
}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h b/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h
new file mode 100644
index 0000000..4a5f61a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h
@@ -0,0 +1,16 @@
+! CUDA BLAS interface binding for SAXPY.
+
+ use iso_c_binding
+ interface
+ subroutine cublassaxpy(N, alpha, x, incx, y, incy)
+ 1 bind(c, name="cublasSaxpy")
+ use iso_c_binding
+ integer(kind=c_int), value :: N
+ real(kind=c_float), value :: alpha
+ type(*), dimension(*) :: x
+ integer(kind=c_int), value :: incx
+ type(*), dimension(*) :: y
+ integer(kind=c_int), value :: incy
+ end subroutine cublassaxpy
+ end interface
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
index 68e14e3..ff09218 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
@@ -1,7 +1,8 @@
-! Test host_data interoperability with CUDA blas.
+! Test host_data interoperability with CUDA blas. This test was
+! derived from libgomp.oacc-c-c++-common/host_data-1.c.
! { dg-do run { target openacc_nvidia_accel_selected } }
-! { dg-additional-options "-lcublas" }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
program test
implicit none
@@ -14,7 +15,7 @@ program test
subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy")
use iso_c_binding
integer(kind=c_int), value :: N
- real*4, value :: alpha
+ real(kind=c_float), value :: alpha
type(*), dimension(*) :: x
integer(kind=c_int), value :: incx
type(*), dimension(*) :: y
@@ -32,16 +33,14 @@ program test
end do
call saxpy (N, a, x_ref, y_ref)
-
+
!$acc data copyin (x) copy (y)
!$acc host_data use_device (x, y)
call cublassaxpy(N, a, x, 1, y, 1)
!$acc end host_data
!$acc end data
-
- do i = 1, N
- if (y(i) .ne. y_ref(i)) call abort
- end do
+
+ call validate_results (N, y, y_ref)
!$acc data create (x) copyout (y)
!$acc parallel loop
@@ -55,31 +54,45 @@ program test
!$acc end host_data
!$acc end data
- do i = 1, N
- if (y(i) .ne. y_ref(i)) call abort
- end do
+ call validate_results (N, y, y_ref)
y(:) = 3.0
-
+
!$acc data copyin (x) copyin (a) copy (y)
!$acc parallel present (x) pcopy (y) present (a)
call saxpy (N, a, x, y)
!$acc end parallel
!$acc end data
- do i = 1, N
- if (y(i) .ne. y_ref(i)) call abort
- end do
+ call validate_results (N, y, y_ref)
+
+ y(:) = 3.0
+
+ !$acc enter data copyin (x, a, y)
+ !$acc parallel present (x) pcopy (y) present (a)
+ call saxpy (N, a, x, y)
+ !$acc end parallel
+ !$acc exit data delete (x, a) copyout (y)
+
+ call validate_results (N, y, y_ref)
end program test
subroutine saxpy (nn, aa, xx, yy)
integer :: nn
real*4 :: aa, xx(nn), yy(nn)
integer i
- real*4 :: t
!$acc routine
do i = 1, nn
yy(i) = yy(i) + aa * xx(i)
end do
end subroutine saxpy
+
+subroutine validate_results (n, a, b)
+ integer :: n
+ real*4 :: a(n), b(n)
+
+ do i = 1, N
+ if (abs(a(i) - b(i)) > 0.0001) call abort
+ end do
+end subroutine validate_results
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f b/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f
new file mode 100644
index 0000000..05ed949
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f
@@ -0,0 +1,85 @@
+! Fixed-mode host_data interaction with CUDA BLAS.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
+
+ include "cublas-fixed.h"
+
+ integer, parameter :: N = 10
+ integer :: i
+ real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+
+ a = 2.0
+
+ do i = 1, N
+ x(i) = 4.0 * i
+ y(i) = 3.0
+ x_ref(i) = x(i)
+ y_ref(i) = y(i)
+ end do
+
+ call saxpy (N, a, x_ref, y_ref)
+
+!$acc data copyin (x) copy (y)
+!$acc host_data use_device (x, y)
+ call cublassaxpy(N, a, x, 1, y, 1)
+!$acc end host_data
+!$acc end data
+
+ call validate_results (N, y, y_ref)
+
+!$acc data create (x) copyout (y)
+!$acc parallel loop
+ do i = 1, N
+ y(i) = 3.0
+ end do
+!$acc end parallel loop
+
+!$acc host_data use_device (x, y)
+ call cublassaxpy(N, a, x, 1, y, 1)
+!$acc end host_data
+!$acc end data
+
+ call validate_results (N, y, y_ref)
+
+ y(:) = 3.0
+
+!$acc data copyin (x) copyin (a) copy (y)
+!$acc parallel present (x) pcopy (y) present (a)
+ call saxpy (N, a, x, y)
+!$acc end parallel
+!$acc end data
+
+ call validate_results (N, y, y_ref)
+
+ y(:) = 3.0
+
+!$acc enter data copyin (x, a, y)
+!$acc parallel present (x) pcopy (y) present (a)
+ call saxpy (N, a, x, y)
+!$acc end parallel
+!$acc exit data delete (x, a) copyout (y)
+
+ call validate_results (N, y, y_ref)
+ end
+
+ subroutine saxpy (nn, aa, xx, yy)
+ integer :: nn
+ real*4 :: aa, xx(nn), yy(nn)
+ integer i
+!$acc routine
+
+ do i = 1, nn
+ yy(i) = yy(i) + aa * xx(i)
+ end do
+ end subroutine saxpy
+
+ subroutine validate_results (n, a, b)
+ integer :: n
+ real*4 :: a(n), b(n)
+
+ do i = 1, N
+ if (abs(a(i) - b(i)) > 0.0001) call abort
+ end do
+ end subroutine validate_results
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90
new file mode 100644
index 0000000..6e379b5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90
@@ -0,0 +1,101 @@
+! Test host_data interoperability with CUDA blas using modules.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
+
+module cublas
+ interface
+ subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy")
+ use iso_c_binding
+ integer(kind=c_int), value :: N
+ real(kind=c_float), value :: alpha
+ type(*), dimension(*) :: x
+ integer(kind=c_int), value :: incx
+ type(*), dimension(*) :: y
+ integer(kind=c_int), value :: incy
+ end subroutine cublassaxpy
+ end interface
+
+contains
+ subroutine saxpy (nn, aa, xx, yy)
+ integer :: nn
+ real*4 :: aa, xx(nn), yy(nn)
+ integer i
+ !$acc routine
+
+ do i = 1, nn
+ yy(i) = yy(i) + aa * xx(i)
+ end do
+ end subroutine saxpy
+
+ subroutine validate_results (n, a, b)
+ integer :: n
+ real*4 :: a(n), b(n)
+
+ do i = 1, N
+ if (abs(a(i) - b(i)) > 0.0001) call abort
+ end do
+ end subroutine validate_results
+end module cublas
+
+program test
+ use cublas
+ implicit none
+
+ integer, parameter :: N = 10
+ integer :: i
+ real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+
+ a = 2.0
+
+ do i = 1, N
+ x(i) = 4.0 * i
+ y(i) = 3.0
+ x_ref(i) = x(i)
+ y_ref(i) = y(i)
+ end do
+
+ call saxpy (N, a, x_ref, y_ref)
+
+ !$acc data copyin (x) copy (y)
+ !$acc host_data use_device (x, y)
+ call cublassaxpy(N, a, x, 1, y, 1)
+ !$acc end host_data
+ !$acc end data
+
+ call validate_results (N, y, y_ref)
+
+ !$acc data create (x) copyout (y)
+ !$acc parallel loop
+ do i = 1, N
+ y(i) = 3.0
+ end do
+ !$acc end parallel loop
+
+ !$acc host_data use_device (x, y)
+ call cublassaxpy(N, a, x, 1, y, 1)
+ !$acc end host_data
+ !$acc end data
+
+ call validate_results (N, y, y_ref)
+
+ y(:) = 3.0
+
+ !$acc data copyin (x) copyin (a) copy (y)
+ !$acc parallel present (x) pcopy (y) present (a)
+ call saxpy (N, a, x, y)
+ !$acc end parallel
+ !$acc end data
+
+ call validate_results (N, y, y_ref)
+
+ y(:) = 3.0
+
+ !$acc enter data copyin (x, a, y)
+ !$acc parallel present (x) pcopy (y) present (a)
+ call saxpy (N, a, x, y)
+ !$acc end parallel
+ !$acc exit data delete (x, a) copyout (y)
+
+ call validate_results (N, y, y_ref)
+end program test