This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PING] [PING] Make the OpenACC C++ acc_on_device wrapper "always inline"


Hi!

Ping.

On Tue, 30 May 2017 14:35:29 +0200, I wrote:
> Ping.
> 
> On Tue, 23 May 2017 17:31:11 +0200, I wrote:
> > On Thu, 29 Oct 2015 17:22:46 -0700, Nathan Sidwell <nathan@acm.org> wrote:
> > > acc_on_device and it's builtin had a conflict.  The function formally takes an 
> > > enum argument, but the builtin takes an int -- primarily to avoid the compiler 
> > > having to generate the enum  type internally.
> > > 
> > > This works fine for C,  where the external declaration of the function (in 
> > > openacc.h) matches up with the builtin, and we optimize the builtin as expected.
> > > 
> > > It fails for C++ where the builtin doesn't match the declaration in the header. 
> > >   We end up with emitting a call to acc_on_device,  which is resolved by 
> > > libgomp.  Unfortunately that means we fail to optimize.  [...]
> > 
> > > [Nathan's trunk r229562] leaves things unchanged for C --  declare a function with an enum arg. 
> > >   But for C++ we the extern "C" declaration takes an int -- and therefore 
> > > matches the builtin.  We insert an inline wrapper that takes an enum argument. 
> > > Because of C++'s overload resolution both the wrapper and the int-taking 
> > > declaration can have the same source name.
> > 
> > > --- libgomp/openacc.h	(revision 229535)
> > > +++ libgomp/openacc.h	(working copy)
> > 
> > > -int acc_on_device (acc_device_t) __GOACC_NOTHROW;
> > > +#ifdef __cplusplus
> > > +int acc_on_device (int __arg) __GOACC_NOTHROW;
> > > +#else
> > > +int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
> > > +#endif
> > 
> > >  #ifdef __cplusplus
> > >  }
> > > +
> > > +/* Forwarding function with correctly typed arg.  */
> > > +
> > > +inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> > > +{
> > > +  return acc_on_device ((int) __arg);
> > > +}
> > >  #endif
> > 
> > > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(revision 0)
> > > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c	(working copy)
> > > @@ -0,0 +1,12 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O2" } */
> > > +
> > > +#include <openacc.h>
> > > +
> > > +int Foo (acc_device_t x)
> > > +{
> > > +  return acc_on_device (x);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-not "acc_on_device" } } */
> > 
> > As a user, I'd expect that when compiling such code with "-O0" instead of
> > "-O2", but adding "__attribute__ ((optimize ("O2")))" to "Foo", that I'd
> > then get "acc_on_device" expanded as a builtin, and no calls to the
> > "acc_on_device library function.  In C++ that is currently not working,
> > because the "Forwarding function with correctly typed arg" (cited above)
> > doesn't "inherit" that "optimize" attribute.  Making that one "always
> > inline" resolves the problem.  Also I cleaned up and extended testing
> > some more.  OK for trunk?
> > 
> > commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
> > Author: Thomas Schwinge <thomas@codesourcery.com>
> > Date:   Tue May 23 13:21:14 2017 +0200
> > 
> >     Make the OpenACC C++ acc_on_device wrapper "always inline"
> >     
> >             libgomp/
> >             * openacc.h [__cplusplus] (acc_on_device): Mark as "always
> >             inline".
> >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
> >             file; test cases already present...
> >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
> >             this file.  Update.
> >             * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
> >             file; test cases now present...
> >             * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
> >             this new file.
> >             * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
> > ---
> >  libgomp/openacc.h                                  |  3 +-
> >  .../libgomp.oacc-c-c++-common/acc-on-device-2.c    | 22 -------------
> >  .../libgomp.oacc-c-c++-common/acc-on-device.c      | 12 -------
> >  .../libgomp.oacc-c-c++-common/acc_on_device-1.c    | 38 +++++++++++++---------
> >  .../libgomp.oacc-c-c++-common/acc_on_device-2.c    | 21 ++++++++++++
> >  .../libgomp.oacc-c-c++-common/parallel-dims.c      | 14 ++++----
> >  6 files changed, 52 insertions(+), 58 deletions(-)
> > 
> > diff --git libgomp/openacc.h libgomp/openacc.h
> > index 137e2c1..266f559 100644
> > --- libgomp/openacc.h
> > +++ libgomp/openacc.h
> > @@ -121,7 +121,8 @@ int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
> >  /* Forwarding function with correctly typed arg.  */
> >  
> >  #pragma acc routine seq
> > -inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> > +inline __attribute__ ((__always_inline__)) int
> > +acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> >  {
> >    return acc_on_device ((int) __arg);
> >  }
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
> > deleted file mode 100644
> > index bfcb67d..0000000
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
> > +++ /dev/null
> > @@ -1,22 +0,0 @@
> > -/* Test the acc_on_device library function. */
> > -/* { dg-additional-options "-fno-builtin-acc_on_device" } */
> > -
> > -#include <openacc.h>
> > -
> > -int main ()
> > -{
> > -  int dev;
> > -  
> > -#pragma acc parallel copyout (dev)
> > -  {
> > -    dev = acc_on_device (acc_device_not_host);
> > -  }
> > -
> > -  int expect = 1;
> > -  
> > -#if  ACC_DEVICE_TYPE_host
> > -  expect = 0;
> > -#endif
> > -  
> > -  return dev != expect;
> > -}
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> > deleted file mode 100644
> > index e0d8710..0000000
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> > +++ /dev/null
> > @@ -1,12 +0,0 @@
> > -/* { dg-do compile } */
> > -/* We don't expect this to work with optimizations disabled.
> > -   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
> > -
> > -#include <openacc.h>
> > -
> > -int Foo (acc_device_t x)
> > -{
> > -  return acc_on_device (x);
> > -}
> > -
> > -/* { dg-final { scan-assembler-not "acc_on_device" } } */
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> > index 8112745..eb962e4 100644
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
> > @@ -1,6 +1,9 @@
> >  /* Disable the acc_on_device builtin; we want to test the libgomp library
> >     function.  */
> > +/* { dg-additional-options "-DACC_ON_DEVICE=acc_on_device" } */
> >  /* { dg-additional-options "-fno-builtin-acc_on_device" } */
> > +/* { dg-additional-options "-fdump-rtl-expand" }
> > +   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 12 "expand" } } */
> >  
> >  #include <stdlib.h>
> >  #include <openacc.h>
> > @@ -11,13 +14,13 @@ main (int argc, char *argv[])
> >    /* Host.  */
> >  
> >    {
> > -    if (!acc_on_device (acc_device_none))
> > +    if (!ACC_ON_DEVICE (acc_device_none))
> >        abort ();
> > -    if (!acc_on_device (acc_device_host))
> > +    if (!ACC_ON_DEVICE (acc_device_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_not_host))
> > +    if (ACC_ON_DEVICE (acc_device_not_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_nvidia))
> > +    if (ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >    }
> >  
> > @@ -26,39 +29,44 @@ main (int argc, char *argv[])
> >  
> >  #pragma acc parallel if(0)
> >    {
> > -    if (!acc_on_device (acc_device_none))
> > +    if (!ACC_ON_DEVICE (acc_device_none))
> >        abort ();
> > -    if (!acc_on_device (acc_device_host))
> > +    if (!ACC_ON_DEVICE (acc_device_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_not_host))
> > +    if (ACC_ON_DEVICE (acc_device_not_host))
> >        abort ();
> > -    if (acc_on_device (acc_device_nvidia))
> > +    if (ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >    }
> >  
> >  
> > -#if !ACC_DEVICE_TYPE_host
> > +  int on_host_p;
> > +#if ACC_DEVICE_TYPE_host
> > +  on_host_p = 1;
> > +#else
> > +  on_host_p = 0;
> > +#endif
> >  
> >    /* Offloaded.  */
> >  
> >  #pragma acc parallel
> >    {
> > -    if (acc_on_device (acc_device_none))
> > +    if (on_host_p != ACC_ON_DEVICE (acc_device_none))
> >        abort ();
> > -    if (acc_on_device (acc_device_host))
> > +    if (on_host_p != ACC_ON_DEVICE (acc_device_host))
> >        abort ();
> > -    if (!acc_on_device (acc_device_not_host))
> > +    if (on_host_p == ACC_ON_DEVICE (acc_device_not_host))
> >        abort ();
> > +
> >  #if ACC_DEVICE_TYPE_nvidia
> > -    if (!acc_on_device (acc_device_nvidia))
> > +    if (!ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >  #else
> > -    if (acc_on_device (acc_device_nvidia))
> > +    if (ACC_ON_DEVICE (acc_device_nvidia))
> >        abort ();
> >  #endif
> >    }
> >  
> > -#endif
> >  
> >    return 0;
> >  }
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
> > new file mode 100644
> > index 0000000..c3b3378
> > --- /dev/null
> > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
> > @@ -0,0 +1,21 @@
> > +/* With the acc_on_device builtin enabled, we don't expect any calls to the
> > +   libgomp library function.  */
> > +/* { dg-additional-options "-fdump-rtl-expand" }
> > +   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 0 "expand" } } */
> > +
> > +#include <openacc.h>
> > +
> > +#ifdef __OPTIMIZE__
> > +# define ACC_ON_DEVICE acc_on_device
> > +#else
> > +/* Without optimizations enabled, we're not expecting the acc_on_device builtin
> > +   to be used, so use here a "-O2" wrapper.  */
> > +#pragma acc routine seq
> > +static int __attribute__ ((optimize ("O2")))
> > +ACC_ON_DEVICE (acc_device_t arg)
> > +{
> > +  return acc_on_device (arg);
> > +}
> > +#endif
> > +
> > +#include "acc_on_device-1.c"
> > diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> > index 8308f7c..1c48ab3 100644
> > --- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> > +++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
> > @@ -4,14 +4,12 @@
> >  #include <limits.h>
> >  #include <openacc.h>
> >  
> > -/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
> > -   not behaving as expected for -O0.  */
> >  #pragma acc routine seq
> >  static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
> >  {
> > -  if (acc_on_device ((int) acc_device_host))
> > +  if (acc_on_device (acc_device_host))
> >      return 0;
> > -  else if (acc_on_device ((int) acc_device_nvidia))
> > +  else if (acc_on_device (acc_device_nvidia))
> >      {
> >        unsigned int r;
> >        asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
> > @@ -24,9 +22,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
> >  #pragma acc routine seq
> >  static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
> >  {
> > -  if (acc_on_device ((int) acc_device_host))
> > +  if (acc_on_device (acc_device_host))
> >      return 0;
> > -  else if (acc_on_device ((int) acc_device_nvidia))
> > +  else if (acc_on_device (acc_device_nvidia))
> >      {
> >        unsigned int r;
> >        asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
> > @@ -39,9 +37,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
> >  #pragma acc routine seq
> >  static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
> >  {
> > -  if (acc_on_device ((int) acc_device_host))
> > +  if (acc_on_device (acc_device_host))
> >      return 0;
> > -  else if (acc_on_device ((int) acc_device_nvidia))
> > +  else if (acc_on_device (acc_device_nvidia))
> >      {
> >        unsigned int r;
> >        asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));


Grüße
 Thomas


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]