This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [gomp4] Extend libgomp's fortran test coverage of host_data


On 08/11/2016 03:38 PM, Cesar Philippidis wrote:
> This patch ports libgomp.oacc-c-c++-common/host_data-1.c to fortran.
> Fortunately, the existing fortran host_data infrastructure was already
> in place, so I had to do was port over the calls to Nvidia's CUDA BLAS
> library.
> 
> There are a couple of details that one needs to consider when using CUDA
> BLAS in gfortran. First, if you want to use Nvidia's wrapper functions
> written in C to set up the appropriate cuda device contexts, then use
> the thunking variants of the functions described here
> <http://docs.nvidia.com/cuda/cublas/#appendix-b-cublas-fortran-bindings>.
> Otherwise, it's much easier to let gfortran's OpenACC runtime manage the
> data mappings and use the host_data clause to pass those data pointers
> to the CUDA BLAS library calls.
> 
> In terms of calling the actual CUDA BLAS functions, there's already good
> documentation for that here
> <https://gcc.gnu.org/onlinedocs/gfortran/Interoperability-with-C.html>.
> Basically, those library calls need a function interface with a special
> C binding. The function I tested in host_data-2.f90 is cublasSaxpy.
> Other function interfaces will need to be created as necessary.
> 
> I've applied this patch to gomp-4_0-branch.

I've added some additional test coverage in this patch. Specifically,
I've included both a module and fixed-mode test. I also corrected some
problems when host_data-2.f90 and host_data-1.c are built with -Wall
-Wextra.

This patch has been applied to gomp-4_0-branch.

Cesar
2016-08-12  Cesar Philippidis  <cesar@codesourcery.com>

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Increase test
	coverage.  Build with -Wall -Wextra.
	* testsuite/libgomp.oacc-fortran/host_data-2.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/cublas-fixed.h: New test.
	* testsuite/libgomp.oacc-fortran/host_data-3.f: New test.
	* testsuite/libgomp.oacc-fortran/host_data-4.f90: New test.


diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
index d19aa20..fe843ec 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c
@@ -1,14 +1,16 @@
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
-/* { dg-additional-options "-lcuda -lcublas -lcudart" } */
+/* { dg-additional-options "-lm -lcuda -lcublas -lcudart -Wall -Wextra" } */
 
 #include <stdlib.h>
+#include <math.h>
 #include <openacc.h>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <cublas_v2.h>
 
+#pragma acc routine
 void
-saxpy_host (int n, float a, float *x, float *y)
+saxpy (int n, float a, float *x, float *y)
 {
   int i;
 
@@ -16,18 +18,18 @@ saxpy_host (int n, float a, float *x, float *y)
     y[i] = y[i] + a * x[i];
 }
 
-#pragma acc routine
 void
-saxpy_target (int n, float a, float *x, float *y)
+validate_results (int n, float *a, float *b)
 {
   int i;
 
   for (i = 0; i < n; i++)
-    y[i] = y[i] + a * x[i];
+    if (fabs (a[i] - b[i]) > .00001)
+      abort ();
 }
 
 int
-main(int argc, char **argv)
+main()
 {
   /* We're using cuBLAS, so excpect to be using a Nvidia GPU.  */
   acc_init (acc_device_nvidia);
@@ -45,7 +47,7 @@ main(int argc, char **argv)
       y[i] = y_ref[i] = 3.0;
     }
 
-  saxpy_host (N, a, x_ref, y_ref);
+  saxpy (N, a, x_ref, y_ref);
 
   cublasCreate (&h);
 
@@ -57,11 +59,7 @@ main(int argc, char **argv)
     }
   }
 
-  for (i = 0; i < N; i++)
-    {
-      if (y[i] != y_ref[i])
-        abort ();
-    }
+  validate_results (N, y, y_ref);
 
 #pragma acc data create (x[0:N]) copyout (y[0:N])
   {
@@ -77,11 +75,7 @@ main(int argc, char **argv)
 
   cublasDestroy (h);
 
-  for (i = 0; i < N; i++)
-    {
-      if (y[i] != y_ref[i])
-        abort ();
-    }
+  validate_results (N, y, y_ref);
 
   for (i = 0; i < N; i++)
     y[i] = 3.0;
@@ -90,14 +84,24 @@ main(int argc, char **argv)
 #pragma acc data copyin (x[0:N]) copyin (a) copy (y[0:N])
   {
 #pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
-    saxpy_target (N, a, x, y);
+    saxpy (N, a, x, y);
   }
 
+  validate_results (N, y, y_ref);
+
+  /* Exercise host_data with data transferred with acc enter data.  */
+
   for (i = 0; i < N; i++)
-    {
-      if (y[i] != y_ref[i])
-        abort ();
-    }
+    y[i] = 3.0;
+
+#pragma acc enter data copyin (x, a, y)
+#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a)
+  {
+    saxpy (N, a, x, y);
+  }
+#pragma acc exit data delete (x, a) copyout (y)
+
+  validate_results (N, y, y_ref);
 
   return 0;
 }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h b/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h
new file mode 100644
index 0000000..4a5f61a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h
@@ -0,0 +1,16 @@
+! CUDA BLAS interface binding for SAXPY.
+      
+      use iso_c_binding
+      interface
+        subroutine cublassaxpy(N, alpha, x, incx, y, incy)
+     1    bind(c, name="cublasSaxpy")
+          use iso_c_binding
+          integer(kind=c_int), value :: N
+          real(kind=c_float), value :: alpha
+          type(*), dimension(*) :: x
+          integer(kind=c_int), value :: incx
+          type(*), dimension(*) :: y
+          integer(kind=c_int), value :: incy
+        end subroutine cublassaxpy
+      end interface
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
index 68e14e3..ff09218 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90
@@ -1,7 +1,8 @@
-! Test host_data interoperability with CUDA blas.
+! Test host_data interoperability with CUDA blas.  This test was
+! derived from libgomp.oacc-c-c++-common/host_data-1.c.
 
 ! { dg-do run { target openacc_nvidia_accel_selected } }
-! { dg-additional-options "-lcublas" }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
 
 program test
   implicit none
@@ -14,7 +15,7 @@ program test
      subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy")
        use iso_c_binding
        integer(kind=c_int), value :: N
-       real*4, value :: alpha
+       real(kind=c_float), value :: alpha
        type(*), dimension(*) :: x
        integer(kind=c_int), value :: incx
        type(*), dimension(*) :: y
@@ -32,16 +33,14 @@ program test
   end do
 
   call saxpy (N, a, x_ref, y_ref)
-  
+
   !$acc data copyin (x) copy (y)
   !$acc host_data use_device (x, y)
   call cublassaxpy(N, a, x, 1, y, 1)
   !$acc end host_data
   !$acc end data
-  
-  do i = 1, N
-     if (y(i) .ne. y_ref(i)) call abort
-  end do
+
+  call validate_results (N, y, y_ref)
 
   !$acc data create (x) copyout (y)
   !$acc parallel loop
@@ -55,31 +54,45 @@ program test
   !$acc end host_data
   !$acc end data
 
-  do i = 1, N
-     if (y(i) .ne. y_ref(i)) call abort
-  end do
+  call validate_results (N, y, y_ref)
 
   y(:) = 3.0
-  
+
   !$acc data copyin (x) copyin (a) copy (y)
   !$acc parallel present (x) pcopy (y) present (a)
   call saxpy (N, a, x, y)
   !$acc end parallel
   !$acc end data
 
-  do i = 1, N
-     if (y(i) .ne. y_ref(i)) call abort
-  end do
+  call validate_results (N, y, y_ref)
+
+  y(:) = 3.0
+
+  !$acc enter data copyin (x, a, y)
+  !$acc parallel present (x) pcopy (y) present (a)
+  call saxpy (N, a, x, y)
+  !$acc end parallel
+  !$acc exit data delete (x, a) copyout (y)
+
+  call validate_results (N, y, y_ref)
 end program test
 
 subroutine saxpy (nn, aa, xx, yy)
   integer :: nn
   real*4 :: aa, xx(nn), yy(nn)
   integer i
-  real*4 :: t
   !$acc routine
 
   do i = 1, nn
     yy(i) = yy(i) + aa * xx(i)
   end do
 end subroutine saxpy
+
+subroutine validate_results (n, a, b)
+  integer :: n
+  real*4 :: a(n), b(n)
+
+  do i = 1, N
+     if (abs(a(i) - b(i)) > 0.0001) call abort
+  end do
+end subroutine validate_results
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f b/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f
new file mode 100644
index 0000000..05ed949
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f
@@ -0,0 +1,85 @@
+! Fixed-mode host_data interaction with CUDA BLAS.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
+
+      include "cublas-fixed.h"
+
+      integer, parameter :: N = 10
+      integer :: i
+      real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+
+      a = 2.0
+
+      do i = 1, N
+         x(i) = 4.0 * i
+         y(i) = 3.0
+         x_ref(i) = x(i)
+         y_ref(i) = y(i)
+      end do
+
+      call saxpy (N, a, x_ref, y_ref)
+  
+!$acc data copyin (x) copy (y)
+!$acc host_data use_device (x, y)
+      call cublassaxpy(N, a, x, 1, y, 1)
+!$acc end host_data
+!$acc end data
+
+      call validate_results (N, y, y_ref)
+
+!$acc data create (x) copyout (y)
+!$acc parallel loop
+      do i = 1, N
+         y(i) = 3.0
+      end do
+!$acc end parallel loop
+
+!$acc host_data use_device (x, y)
+      call cublassaxpy(N, a, x, 1, y, 1)
+!$acc end host_data
+!$acc end data
+
+      call validate_results (N, y, y_ref)
+
+      y(:) = 3.0
+  
+!$acc data copyin (x) copyin (a) copy (y)
+!$acc parallel present (x) pcopy (y) present (a)
+      call saxpy (N, a, x, y)
+!$acc end parallel
+!$acc end data
+
+      call validate_results (N, y, y_ref)
+
+      y(:) = 3.0
+  
+!$acc enter data copyin (x, a, y)
+!$acc parallel present (x) pcopy (y) present (a)
+      call saxpy (N, a, x, y)
+!$acc end parallel
+!$acc exit data delete (x, a) copyout (y)
+
+      call validate_results (N, y, y_ref)
+      end
+
+      subroutine saxpy (nn, aa, xx, yy)
+      integer :: nn
+      real*4 :: aa, xx(nn), yy(nn)
+      integer i
+!$acc routine
+
+      do i = 1, nn
+         yy(i) = yy(i) + aa * xx(i)
+      end do
+      end subroutine saxpy
+
+      subroutine validate_results (n, a, b)
+      integer :: n
+      real*4 :: a(n), b(n)
+
+      do i = 1, N
+         if (abs(a(i) - b(i)) > 0.0001) call abort
+      end do
+      end subroutine validate_results
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90
new file mode 100644
index 0000000..6e379b5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90
@@ -0,0 +1,101 @@
+! Test host_data interoperability with CUDA blas using modules.
+
+! { dg-do run { target openacc_nvidia_accel_selected } }
+! { dg-additional-options "-lcublas -Wall -Wextra" }
+
+module cublas
+  interface
+     subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy")
+       use iso_c_binding
+       integer(kind=c_int), value :: N
+       real(kind=c_float), value :: alpha
+       type(*), dimension(*) :: x
+       integer(kind=c_int), value :: incx
+       type(*), dimension(*) :: y
+       integer(kind=c_int), value :: incy
+     end subroutine cublassaxpy
+  end interface
+
+contains
+  subroutine saxpy (nn, aa, xx, yy)
+    integer :: nn
+    real*4 :: aa, xx(nn), yy(nn)
+    integer i
+    !$acc routine
+
+    do i = 1, nn
+       yy(i) = yy(i) + aa * xx(i)
+    end do
+  end subroutine saxpy
+
+  subroutine validate_results (n, a, b)
+    integer :: n
+    real*4 :: a(n), b(n)
+
+    do i = 1, N
+       if (abs(a(i) - b(i)) > 0.0001) call abort
+    end do
+  end subroutine validate_results
+end module cublas
+
+program test
+  use cublas
+  implicit none
+
+  integer, parameter :: N = 10
+  integer :: i
+  real*4 :: x_ref(N), y_ref(N), x(N), y(N), a
+
+  a = 2.0
+
+  do i = 1, N
+     x(i) = 4.0 * i
+     y(i) = 3.0
+     x_ref(i) = x(i)
+     y_ref(i) = y(i)
+  end do
+
+  call saxpy (N, a, x_ref, y_ref)
+
+  !$acc data copyin (x) copy (y)
+  !$acc host_data use_device (x, y)
+  call cublassaxpy(N, a, x, 1, y, 1)
+  !$acc end host_data
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  !$acc data create (x) copyout (y)
+  !$acc parallel loop
+  do i = 1, N
+     y(i) = 3.0
+  end do
+  !$acc end parallel loop
+
+  !$acc host_data use_device (x, y)
+  call cublassaxpy(N, a, x, 1, y, 1)
+  !$acc end host_data
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  y(:) = 3.0
+
+  !$acc data copyin (x) copyin (a) copy (y)
+  !$acc parallel present (x) pcopy (y) present (a)
+  call saxpy (N, a, x, y)
+  !$acc end parallel
+  !$acc end data
+
+  call validate_results (N, y, y_ref)
+
+  y(:) = 3.0
+
+  !$acc enter data copyin (x, a, y)
+  !$acc parallel present (x) pcopy (y) present (a)
+  call saxpy (N, a, x, y)
+  !$acc end parallel
+  !$acc exit data delete (x, a) copyout (y)
+
+  call validate_results (N, y, y_ref)
+end program test

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]