This is the mail archive of the
fortran@gcc.gnu.org
mailing list for the GNU Fortran project.
[gomp4] Extend libgomp's fortran test coverage of host_data
- From: Cesar Philippidis <cesar at codesourcery dot com>
- To: Fortran List <fortran at gcc dot gnu dot org>, "gcc-patches at gcc dot gnu dot org" <gcc-patches at gcc dot gnu dot org>
- Date: Thu, 11 Aug 2016 15:38:55 -0700
- Subject: [gomp4] Extend libgomp's fortran test coverage of host_data
- Authentication-results: sourceware.org; auth=none
This patch ports libgomp.oacc-c-c++-common/host_data-1.c to fortran.
Fortunately, the existing fortran host_data infrastructure was already
in place, so I had to do was port over the calls to Nvidia's CUDA BLAS
library.
There are a couple of details that one needs to consider when using CUDA
BLAS in gfortran. First, if you want to use Nvidia's wrapper functions
written in C to set up the appropriate cuda device contexts, then use
the thunking variants of the functions described here
<http://docs.nvidia.com/cuda/cublas/#appendix-b-cublas-fortran-bindings>.
Otherwise, it's much easier to let gfortran's OpenACC runtime manage the
data mappings and use the host_data clause to pass those data pointers
to the CUDA BLAS library calls.
In terms of calling the actual CUDA BLAS functions, there's already good
documentation for that here
<https://gcc.gnu.org/onlinedocs/gfortran/Interoperability-with-C.html>.
Basically, those library calls need a function interface with a special
C binding. The function I tested in host_data-2.f90 is cublasSaxpy.
Other function interfaces will need to be created as necessary.
I've applied this patch to gomp-4_0-branch.
Cesar
2016-08-11 Cesar Philippidis <cesar@codesourcery.com>
libgomp/
* testsuite/libgomp.oacc-fortran/host_data-1.f90: Remove stale xfail.
* testsuite/libgomp.oacc-fortran/host_data-2.f90: New test.
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-1.f90
index 497b0f7..69a491d 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/host_data-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-1.f90
@@ -1,9 +1,6 @@
! { dg-do run }
! { dg-additional-options "-cpp" }
-! { dg-xfail-if "TODO" { *-*-* } }
-! { dg-excess-errors "TODO" }
-
program test
implicit none
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
new file mode 100644
index 0000000..68e14e3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
@@ -0,0 +1,85 @@
+! Test host_data interoperability with CUDA blas.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas" }
+
+program test
+ implicit none
+
+ integer, parameter :: N = 10
+ integer :: i
+ real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+
+ interface
+ subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy")
+ use iso_c_binding
+ integer(kind=c_int), value :: N
+ real*4, value :: alpha
+ type(*), dimension(*) :: x
+ integer(kind=c_int), value :: incx
+ type(*), dimension(*) :: y
+ integer(kind=c_int), value :: incy
+ end subroutine cublassaxpy
+ end interface
+
+ a = 2.0
+
+ do i = 1, N
+ x(i) = 4.0 * i
+ y(i) = 3.0
+ x_ref(i) = x(i)
+ y_ref(i) = y(i)
+ end do
+
+ call saxpy (N, a, x_ref, y_ref)
+
+ !$acc data copyin (x) copy (y)
+ !$acc host_data use_device (x, y)
+ call cublassaxpy(N, a, x, 1, y, 1)
+ !$acc end host_data
+ !$acc end data
+
+ do i = 1, N
+ if (y(i) .ne. y_ref(i)) call abort
+ end do
+
+ !$acc data create (x) copyout (y)
+ !$acc parallel loop
+ do i = 1, N
+ y(i) = 3.0
+ end do
+ !$acc end parallel loop
+
+ !$acc host_data use_device (x, y)
+ call cublassaxpy(N, a, x, 1, y, 1)
+ !$acc end host_data
+ !$acc end data
+
+ do i = 1, N
+ if (y(i) .ne. y_ref(i)) call abort
+ end do
+
+ y(:) = 3.0
+
+ !$acc data copyin (x) copyin (a) copy (y)
+ !$acc parallel present (x) pcopy (y) present (a)
+ call saxpy (N, a, x, y)
+ !$acc end parallel
+ !$acc end data
+
+ do i = 1, N
+ if (y(i) .ne. y_ref(i)) call abort
+ end do
+end program test
+
+subroutine saxpy (nn, aa, xx, yy)
+ integer :: nn
+ real*4 :: aa, xx(nn), yy(nn)
+ integer i
+ real*4 :: t
+ !$acc routine
+
+ do i = 1, nn
+ yy(i) = yy(i) + aa * xx(i)
+ end do
+end subroutine saxpy