This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[UPC 20/22] libgupc runtime library [7/9]


[NOTE: Due to email list size limits, this patch is broken into 9 parts.]

Background
----------

An overview email, describing the UPC-related changes is here:
  https://gcc.gnu.org/ml/gcc-patches/2015-12/msg00005.html

The GUPC branch is described here:
  http://gcc.gnu.org/projects/gupc.html

The UPC-related source code differences are summarized here:
  http://gccupc.org/gupc-changes

All languages (c, c++, fortran, go, lto, objc, obj-c++) have been
bootstrapped; no test suite regressions were introduced,
relative to the GCC trunk.

If you are on the cc-list, your name was chosen either
because you are listed as a maintainer for the area that
applies to the patches described in this email, or you
were a frequent contributor of patches made to files listed
in this email.

In the change log entries included in each patch, the directory
containing the affected files is listed, followed by the files.
When the patches are applied, the change log entries will be
distributed to the appropriate ChangeLog file.

Overview
--------

Libgupc is the UPC runtime library, for GUPC.  The configuration,
makefile, and documentation related changes have been broken out into
separate patches.

As noted in the ChangeLog entry below, this is all new code.
Two communication layers are supported: (1) SMP, via 'mmap'
or (2) the Portals4 library API, which supports multi-node
operation.  Libgupc generally requires a POSIX-compliant target OS.

The 'smp' runtime is the default runtime.  The 'portals4'
runtime is experimental; it supports multi-node operation
using the Portals4 communications library.

Most of the libgupc/include/ directory contains standard headers
defined by the UPC language specification. 'make install' will
install these headers in the directory where other "C"
header files are located.

2015-11-30  Gary Funck  <gary@intrepid.com>

	libgupc/portals4/
	* gupcr_access.c: New.
	* gupcr_access.h: New.
	* gupcr_addr.c: New.
	* gupcr_alloc.h: New.
	* gupcr_alloc.upc: New.
	* gupcr_atomic.upc: New.
	* gupcr_atomic_sup.c: New.
	* gupcr_atomic_sup.h: New.
	* gupcr_backtrace.c: New.
	* gupcr_backtrace.h: New.
	* gupcr_barrier.c: New.
	* gupcr_barrier.h: New.
	* gupcr_broadcast.c: New.
	* gupcr_broadcast.h: New.
	* gupcr_castable.upc: New.

Index: libgupc/portals4/gupcr_access.c
===================================================================
--- libgupc/portals4/gupcr_access.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_access.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,1851 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_access.h"
+#include "gupcr_sync.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_node.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+
+/**
+ * @file gupcr_access.c
+ * GUPC compiler access functions.
+ */
+
+/**
+ * @addtogroup IFACE GUPC Interface Routines
+ * @{
+ */
+
+//begin lib_inline_access
+
+/**
+ * Relaxed shared "char (8 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Char (8 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intQI_t
+__getqi2 (upc_shared_ptr_t p)
+{
+  u_intQI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R QI LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(u_intQI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R QI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%x",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Relaxed shared "short (16 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Short (16 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intHI_t
+__gethi2 (upc_shared_ptr_t p)
+{
+  u_intHI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R HI LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(u_intHI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R HI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%x",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Relaxed shared "int (32 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Int (32 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intSI_t
+__getsi2 (upc_shared_ptr_t p)
+{
+  u_intSI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R SI LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(u_intSI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R SI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%x",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Relaxed shared "long (64 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Long (64 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intDI_t
+__getdi2 (upc_shared_ptr_t p)
+{
+  u_intDI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R DI LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(u_intDI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R DI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%llx",
+	       thread, (long unsigned) offset, (long long unsigned) result);
+  return result;
+}
+
+#if GUPCR_TARGET64
+/**
+ * Relaxed shared "long long (128 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Long long (128 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intTI_t
+__getti2 (upc_shared_ptr_t p)
+{
+  u_intTI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R TI LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(u_intTI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R TI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%llx",
+	       thread, (long unsigned) offset, (long long unsigned) result);
+  return result;
+}
+#endif /* GUPCR_TARGET64 */
+/**
+ * Relaxed shared "float" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Float value at the shared address given by 'p'.
+ */
+//inline
+float
+__getsf2 (upc_shared_ptr_t p)
+{
+  float result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R SF LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(float *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R SF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6g",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Relaxed shared "double" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Double value at the shared address given by 'p'.
+ */
+//inline
+double
+__getdf2 (upc_shared_ptr_t p)
+{
+  double result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R DF LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R DF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6g",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Relaxed shared "long double" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Long double value at the shared address given by 'p'.
+ */
+//inline
+long double
+__gettf2 (upc_shared_ptr_t p)
+{
+  long double result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R TF LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R TF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6Lg",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Relaxed shared "long double" get operation ('XF' mode variant).
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Long double value at the shared address given by 'p'.
+ */
+//inline
+long double
+__getxf2 (upc_shared_ptr_t p)
+{
+  long double result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R XF LOCAL");
+      /* Local affinity: read directly through the mapped address.  */
+      result = *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER R XF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6Lg",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Relaxed shared memory block get operation.
+ * Copy the data at the shared address 'src' into the local memory
+ * destination at the address 'dest'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] dest Local address of the destination memory block.
+ * @param [in] src Shared address of the source memory block.
+ * @param [in] n Number of bytes to transfer.
+ */
+//inline
+void
+__getblk3 (void *dest, upc_shared_ptr_t src, size_t n)
+{
+  int thread = GUPCR_PTS_THREAD (src);
+  size_t offset = GUPCR_PTS_OFFSET (src);
+  gupcr_trace (FC_MEM, "GETBLK ENTER R");
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before reading.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      /* NOTE(review): the scalar relaxed gets do not fence the local
+         path; these barriers make the block copy stronger -- confirm
+         whether that is intentional.  */
+      GUPCR_MEM_BARRIER ();
+      memcpy (dest, GUPCR_GMEM_OFF_TO_LOCAL (thread, offset), n);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_gmem_get (dest, thread, offset, n);
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GETBLK EXIT R %d:0x%lx 0x%lx %lu",
+	       thread, (long unsigned) offset,
+	       (long unsigned) dest, (long unsigned) n);
+}
+
+/**
+ * Relaxed shared "char (8 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putqi2 (upc_shared_ptr_t p, u_intQI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R QI LOCAL "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(u_intQI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R QI REMOTE "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R QI");
+}
+
+/**
+ * Relaxed shared "short (16 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__puthi2 (upc_shared_ptr_t p, u_intHI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R HI LOCAL "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(u_intHI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R HI REMOTE "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R HI");
+}
+
+/**
+ * Relaxed shared "int (32 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putsi2 (upc_shared_ptr_t p, u_intSI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R SI LOCAL "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(u_intSI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R SI REMOTE "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R SI");
+}
+
+/**
+ * Relaxed shared "long (64 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putdi2 (upc_shared_ptr_t p, u_intDI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R DI LOCAL "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(u_intDI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R DI REMOTE "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R DI");
+}
+
+#if GUPCR_TARGET64
+/**
+ * Relaxed shared "long long (128 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putti2 (upc_shared_ptr_t p, u_intTI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R TI LOCAL "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(u_intTI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R TI REMOTE "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R TI");
+}
+#endif /* GUPCR_TARGET64 */
+/**
+ * Relaxed shared "float" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putsf2 (upc_shared_ptr_t p, float v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R SF LOCAL "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(float *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R SF REMOTE "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R SF");
+}
+
+/**
+ * Relaxed shared "double" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putdf2 (upc_shared_ptr_t p, double v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R DF LOCAL "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R DF REMOTE "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R DF");
+}
+
+/**
+ * Relaxed shared "long double" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__puttf2 (upc_shared_ptr_t p, long double v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R TF LOCAL "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R TF REMOTE "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R TF");
+}
+
+/**
+ * Relaxed shared "long double" put operation ('XF' mode variant).
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putxf2 (upc_shared_ptr_t p, long double v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R XF LOCAL "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      /* Local affinity: store directly through the mapped address.  */
+      *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER R XF REMOTE "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	  /* There can be only one outstanding unordered put.  */
+	  gupcr_pending_strict_put = 1;
+	}
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT R XF");
+}
+
+/**
+ * Relaxed shared memory block put operation.
+ * Copy the data at the local address 'src' into the shared memory
+ * destination at the address 'dest'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] dest Shared address of the destination memory block.
+ * @param [in] src Local address of the source memory block.
+ * @param [in] n Number of bytes to transfer.
+ */
+//inline
+void
+__putblk3 (upc_shared_ptr_t dest, void *src, size_t n)
+{
+  int thread = GUPCR_PTS_THREAD (dest);
+  size_t offset = GUPCR_PTS_OFFSET (dest);
+  gupcr_trace (FC_MEM, "PUTBLK ENTER R 0x%lx %d:0x%lx %lu",
+	       (long unsigned) src, thread,
+	       (long unsigned) offset, (long unsigned) n);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      /* Local affinity: copy directly into the mapped address.  */
+      memcpy (GUPCR_GMEM_OFF_TO_LOCAL (thread, offset), src, n);
+    }
+  else
+    {
+      gupcr_gmem_put (thread, offset, src, n);
+    }
+  /* Use the same "PUTBLK" tag as the ENTER trace above so that
+     ENTER/EXIT trace records pair up when filtering the log.  */
+  gupcr_trace (FC_MEM, "PUTBLK EXIT R");
+}
+
+/**
+ * Relaxed shared memory block copy operation.
+ * Copy the data at the shared address 'src' into the shared memory
+ * destination at the address 'dest'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] dest Shared address of destination memory block.
+ * @param [in] src Shared address of source memory block.
+ * @param [in] n Number of bytes to transfer.
+ */
+//inline
+void
+__copyblk3 (upc_shared_ptr_t dest, upc_shared_ptr_t src, size_t n)
+{
+  int dthread = GUPCR_PTS_THREAD (dest);
+  size_t doffset = GUPCR_PTS_OFFSET (dest);
+  int sthread = GUPCR_PTS_THREAD (src);
+  size_t soffset = GUPCR_PTS_OFFSET (src);
+  gupcr_trace (FC_MEM, "COPYBLK ENTER R %d:0x%lx %d:0x%lx %lu",
+	       sthread, (long unsigned) soffset,
+	       dthread, (long unsigned) doffset, (long unsigned) n);
+  gupcr_assert (dthread < THREADS);
+  gupcr_assert (doffset != 0);
+  gupcr_assert (sthread < THREADS);
+  gupcr_assert (soffset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before issuing another access.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (dthread) && GUPCR_GMEM_IS_LOCAL (sthread))
+    {
+      /* Both ends are local: plain memory copy.  */
+      memcpy (GUPCR_GMEM_OFF_TO_LOCAL (dthread, doffset),
+	      GUPCR_GMEM_OFF_TO_LOCAL (sthread, soffset), n);
+    }
+  else if (GUPCR_GMEM_IS_LOCAL (dthread))
+    {
+      /* Remote source, local destination: pull then wait.  */
+      gupcr_gmem_get (GUPCR_GMEM_OFF_TO_LOCAL (dthread, doffset),
+		      sthread, soffset, n);
+      gupcr_gmem_sync_gets ();
+    }
+  else if (GUPCR_GMEM_IS_LOCAL (sthread))
+    {
+      /* Local source, remote destination: push.  */
+      gupcr_gmem_put (dthread, doffset,
+		      GUPCR_GMEM_OFF_TO_LOCAL (sthread, soffset), n);
+    }
+  else
+    {
+      /* Both ends remote: network-side copy.  */
+      gupcr_gmem_copy (dthread, doffset, sthread, soffset, n);
+    }
+  /* Use the same "COPYBLK" tag as the ENTER trace above so that
+     ENTER/EXIT trace records pair up when filtering the log.  */
+  gupcr_trace (FC_MEM, "COPYBLK EXIT R");
+}
+
+/**
+ * Strict shared "char (8 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Char (8 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intQI_t
+__getsqi2 (upc_shared_ptr_t p)
+{
+  u_intQI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  /* A previous unordered put may still be in flight;
+     complete it before the strict read.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S QI LOCAL");
+      /* Fence on both sides so the strict read is not reordered
+         with surrounding local accesses.  */
+      GUPCR_MEM_BARRIER ();
+      result = *(u_intQI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S QI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%x",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Strict shared "short (16 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Short (16 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intHI_t
+__getshi2 (upc_shared_ptr_t p)
+{
+  u_intHI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S HI LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(u_intHI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S HI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%x",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Strict shared "int (32 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Int (32 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intSI_t
+__getssi2 (upc_shared_ptr_t p)
+{
+  u_intSI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S SI LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(u_intSI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S SI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%x",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Strict shared "long (64 bits)" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Long (64 bits) value at the shared address given by 'p'.
+ */
+//inline
+u_intDI_t
+__getsdi2 (upc_shared_ptr_t p)
+{
+  u_intDI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S DI LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(u_intDI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S DI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%llx",
+	       thread, (long unsigned) offset, (long long unsigned) result);
+  return result;
+}
+
+#if GUPCR_TARGET64
+/**
+ * Strict shared 128-bit integer get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return 128-bit integer value at the shared address given by 'p'.
+ */
+//inline
+u_intTI_t
+__getsti2 (upc_shared_ptr_t p)
+{
+  u_intTI_t result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S TI LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(u_intTI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S TI REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx 0x%llx",
+	       thread, (long unsigned) offset, (long long unsigned) result);
+  return result;
+}
+#endif /* GUPCR_TARGET64 */
+/**
+ * Strict shared "float" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Float value at the shared address given by 'p'.
+ */
+//inline
+float
+__getssf2 (upc_shared_ptr_t p)
+{
+  float result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S SF LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(float *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S SF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6g",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Strict shared "double" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Double value at the shared address given by 'p'.
+ */
+//inline
+double
+__getsdf2 (upc_shared_ptr_t p)
+{
+  double result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S DF LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S DF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6g",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Strict shared "long double" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Long double value at the shared address given by 'p'.
+ */
+//inline
+long double
+__getstf2 (upc_shared_ptr_t p)
+{
+  long double result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S TF LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S TF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6Lg",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Strict shared "long double" get operation.
+ * Return the value at the shared address 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the source operand.
+ * @return Long double value at the shared address given by 'p'.
+ */
+//inline
+long double
+__getsxf2 (upc_shared_ptr_t p)
+{
+  long double result;
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S XF LOCAL");
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local load.  */
+      result = *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "GET ENTER S XF REMOTE");
+      gupcr_gmem_get (&result, thread, offset, sizeof (result));
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GET EXIT %d:0x%lx %6Lg",
+	       thread, (long unsigned) offset, result);
+  return result;
+}
+
+/**
+ * Strict shared memory block get operation.
+ * Copy the data at the shared address 'src' into the local memory
+ * destination at the address 'dest'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] dest Local address of the destination memory block.
+ * @param [in] src Shared address of the source memory block.
+ * @param [in] n Number of bytes to transfer.
+ */
+//inline
+void
+__getsblk3 (void *dest, upc_shared_ptr_t src, size_t n)
+{
+  int thread = GUPCR_PTS_THREAD (src);
+  size_t offset = GUPCR_PTS_OFFSET (src);
+  gupcr_trace (FC_MEM, "GETBLK ENTER S");
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Strict: complete outstanding puts first.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      GUPCR_MEM_BARRIER ();	/* Fence around the direct local copy.  */
+      memcpy (dest, GUPCR_GMEM_OFF_TO_LOCAL (thread, offset), n);
+      GUPCR_READ_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_gmem_get (dest, thread, offset, n);
+      /* All 'get' operations are synchronous.  */
+      gupcr_gmem_sync_gets ();
+    }
+  gupcr_trace (FC_MEM, "GETBLK EXIT S %d:0x%lx 0x%lx %lu",
+	       thread, (long unsigned) offset,
+	       (long unsigned) dest, (long unsigned) n);
+}
+
+/**
+ * Strict shared "char (8 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putsqi2 (upc_shared_ptr_t p, u_intQI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S QI LOCAL "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(u_intQI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S QI REMOTE "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S QI");
+}
+
+/**
+ * Strict shared "short (16 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putshi2 (upc_shared_ptr_t p, u_intHI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S HI LOCAL "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(u_intHI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S HI REMOTE "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S HI");
+}
+
+/**
+ * Strict shared "int (32 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putssi2 (upc_shared_ptr_t p, u_intSI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S SI LOCAL "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(u_intSI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S SI REMOTE "
+		   "0x%x %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S SI");
+}
+
+/**
+ * Strict shared "long (64 bits)" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putsdi2 (upc_shared_ptr_t p, u_intDI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S DI LOCAL "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(u_intDI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S DI REMOTE "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S DI");
+}
+
+#if GUPCR_TARGET64
+/**
+ * Strict shared 128-bit integer put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putsti2 (upc_shared_ptr_t p, u_intTI_t v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S TI LOCAL "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(u_intTI_t *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S TI REMOTE "
+		   "0x%llx %d:0x%lx",
+		   (long long unsigned) v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S TI");
+}
+#endif /* GUPCR_TARGET64 */
+/**
+ * Strict shared "float" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putssf2 (upc_shared_ptr_t p, float v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S SF LOCAL "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(float *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S SF REMOTE "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S SF");
+}
+
+/**
+ * Strict shared "double" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putsdf2 (upc_shared_ptr_t p, double v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S DF LOCAL "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S DF REMOTE "
+		   "%6g %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S DF");
+}
+
+/**
+ * Strict shared "long double" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putstf2 (upc_shared_ptr_t p, long double v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S TF LOCAL "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S TF REMOTE "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S TF");
+}
+
+/**
+ * Strict shared "long double" put operation.
+ * Store the value given by 'v' into the shared memory destination at 'p'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] p Shared address of the destination address.
+ * @param [in] v Source value.
+ */
+//inline
+void
+__putsxf2 (upc_shared_ptr_t p, long double v)
+{
+  int thread = GUPCR_PTS_THREAD (p);
+  size_t offset = GUPCR_PTS_OFFSET (p);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S XF LOCAL "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local store.  */
+      *(long double *) GUPCR_GMEM_OFF_TO_LOCAL (thread, offset) = v;
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_trace (FC_MEM, "PUT ENTER S XF REMOTE "
+		   "%6Lg %d:0x%lx", v, thread, (long unsigned) offset);
+      if (sizeof (v) <= (size_t) GUPCR_PORTALS_MAX_ORDERED_SIZE)
+	{
+	  /* Ordered puts can proceed in parallel.  */
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      else
+	{
+	  /* Wait for any outstanding 'put' operation.  */
+	  gupcr_gmem_sync_puts ();
+	  gupcr_gmem_put (thread, offset, &v, sizeof (v));
+	}
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT EXIT S XF");
+}
+
+/**
+ * Strict shared memory block put operation.
+ * Copy the data at the local address 'src' into the shared memory
+ * destination at the address 'dest'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] dest Shared address of the destination memory block.
+ * @param [in] src Local address of the source memory block.
+ * @param [in] n Number of bytes to transfer.
+ */
+//inline
+void
+__putsblk3 (upc_shared_ptr_t dest, void *src, size_t n)
+{
+  int thread = GUPCR_PTS_THREAD (dest);
+  size_t offset = GUPCR_PTS_OFFSET (dest);
+  gupcr_trace (FC_MEM, "PUTBLK ENTER S 0x%lx %d:0x%lx %lu",
+	       (long unsigned) src, thread,
+	       (long unsigned) offset, (long unsigned) n);
+  gupcr_assert (thread < THREADS);
+  gupcr_assert (offset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local copy.  */
+      memcpy (GUPCR_GMEM_OFF_TO_LOCAL (thread, offset), src, n);
+      GUPCR_MEM_BARRIER ();
+    }
+  else
+    {
+      gupcr_gmem_put (thread, offset, src, n);
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "PUT_BLK EXIT S");
+}
+
+/**
+ * Strict shared memory block copy operation.
+ * Copy the data at the shared address 'src' into the shared memory
+ * destination at the address 'dest'.
+ *
+ * The interface to this procedure is defined by the UPC compiler API.
+ *
+ * @param [in] dest Shared address of destination memory block.
+ * @param [in] src Shared address of source memory block.
+ * @param [in] n Number of bytes to transfer.
+ */
+//inline
+void
+__copysblk3 (upc_shared_ptr_t dest, upc_shared_ptr_t src, size_t n)
+{
+  int dthread = GUPCR_PTS_THREAD (dest);
+  size_t doffset = GUPCR_PTS_OFFSET (dest);
+  int sthread = GUPCR_PTS_THREAD (src);
+  size_t soffset = GUPCR_PTS_OFFSET (src);
+  gupcr_trace (FC_MEM, "COPYBLK ENTER S %d:0x%lx %d:0x%lx %lu",
+	       sthread, (long unsigned) soffset,
+	       dthread, (long unsigned) doffset, (long unsigned) n);
+  gupcr_assert (dthread < THREADS);
+  gupcr_assert (doffset != 0);
+  gupcr_assert (sthread < THREADS);
+  gupcr_assert (soffset != 0);
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();	/* Serialize with the previous strict put.  */
+  if (GUPCR_GMEM_IS_LOCAL (dthread) && GUPCR_GMEM_IS_LOCAL (sthread))
+    {
+      GUPCR_WRITE_MEM_BARRIER ();	/* Fence around the direct local copy.  */
+      memcpy (GUPCR_GMEM_OFF_TO_LOCAL (dthread, doffset),
+	      GUPCR_GMEM_OFF_TO_LOCAL (sthread, soffset), n);
+      GUPCR_MEM_BARRIER ();
+    }
+  else if (GUPCR_GMEM_IS_LOCAL (dthread))
+    {
+      gupcr_gmem_get (GUPCR_GMEM_OFF_TO_LOCAL (dthread, doffset),
+		      sthread, soffset, n);	/* Pull remote source into local dest.  */
+      gupcr_gmem_sync_gets ();
+    }
+  else if (GUPCR_GMEM_IS_LOCAL (sthread))
+    {
+      gupcr_gmem_put (dthread, doffset,
+		      GUPCR_GMEM_OFF_TO_LOCAL (sthread, soffset), n);	/* Push to remote dest.  */
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  else
+    {
+      gupcr_gmem_copy (dthread, doffset, sthread, soffset, n);	/* Remote-to-remote.  */
+      gupcr_pending_strict_put = 1;	/* Remote put left outstanding.  */
+    }
+  gupcr_trace (FC_MEM, "COPY_BLK EXIT S");
+}
+
+/**
+ * upc_fence implementation.
+ */
+//inline
+void
+__upc_fence (void)
+{
+  GUPCR_MEM_BARRIER ();	/* Order local accesses across the fence.  */
+  gupcr_gmem_sync ();	/* Presumably completes all outstanding remote operations -- see gupcr_gmem.  */
+}
+
+//end lib_inline_access
+/** @} */
Index: libgupc/portals4/gupcr_access.h
===================================================================
--- libgupc/portals4/gupcr_access.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_access.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,179 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#ifndef _GUPCR_ACCESS_H_
+#define _GUPCR_ACCESS_H_
+
+/**
+ * @file gupcr_access.h
+ * GUPC compiler access functions prototypes.
+ */
+
+//begin lib_access_prototypes
+/* Relaxed accesses.  */
+
+extern u_intQI_t __getqi2 (upc_shared_ptr_t);
+extern u_intHI_t __gethi2 (upc_shared_ptr_t);
+extern u_intSI_t __getsi2 (upc_shared_ptr_t);
+extern u_intDI_t __getdi2 (upc_shared_ptr_t);
+#if GUPCR_TARGET64
+extern u_intTI_t __getti2 (upc_shared_ptr_t);
+#endif
+extern float __getsf2 (upc_shared_ptr_t);
+extern double __getdf2 (upc_shared_ptr_t);
+extern long double __gettf2 (upc_shared_ptr_t);
+extern long double __getxf2 (upc_shared_ptr_t);
+extern void __getblk3 (void *, upc_shared_ptr_t, size_t);
+
+extern void __putqi2 (upc_shared_ptr_t, u_intQI_t);
+extern void __puthi2 (upc_shared_ptr_t, u_intHI_t);
+extern void __putsi2 (upc_shared_ptr_t, u_intSI_t);
+extern void __putdi2 (upc_shared_ptr_t, u_intDI_t);
+#if GUPCR_TARGET64
+extern void __putti2 (upc_shared_ptr_t, u_intTI_t);
+#endif
+extern void __putsf2 (upc_shared_ptr_t, float);
+extern void __putdf2 (upc_shared_ptr_t, double);
+extern void __puttf2 (upc_shared_ptr_t, long double);
+extern void __putxf2 (upc_shared_ptr_t, long double);
+extern void __putblk3 (upc_shared_ptr_t, void *, size_t);
+extern void __copyblk3 (upc_shared_ptr_t, upc_shared_ptr_t, size_t);
+
+/* Strict accesses.  */
+
+extern u_intQI_t __getsqi2 (upc_shared_ptr_t);
+extern u_intHI_t __getshi2 (upc_shared_ptr_t);
+extern u_intSI_t __getssi2 (upc_shared_ptr_t);
+extern u_intDI_t __getsdi2 (upc_shared_ptr_t);
+#if GUPCR_TARGET64
+extern u_intTI_t __getsti2 (upc_shared_ptr_t);
+#endif
+extern float __getssf2 (upc_shared_ptr_t);
+extern double __getsdf2 (upc_shared_ptr_t);
+extern long double __getstf2 (upc_shared_ptr_t);
+extern long double __getsxf2 (upc_shared_ptr_t);
+extern void __getsblk3 (void *, upc_shared_ptr_t, size_t);
+
+extern void __putsqi2 (upc_shared_ptr_t, u_intQI_t);
+extern void __putshi2 (upc_shared_ptr_t, u_intHI_t);
+extern void __putssi2 (upc_shared_ptr_t, u_intSI_t);
+extern void __putsdi2 (upc_shared_ptr_t, u_intDI_t);
+#if GUPCR_TARGET64
+extern void __putsti2 (upc_shared_ptr_t, u_intTI_t);
+#endif
+extern void __putssf2 (upc_shared_ptr_t, float);
+extern void __putsdf2 (upc_shared_ptr_t, double);
+extern void __putstf2 (upc_shared_ptr_t, long double);
+extern void __putsxf2 (upc_shared_ptr_t, long double);
+extern void __putsblk3 (upc_shared_ptr_t, void *, size_t);
+extern void __copysblk3 (upc_shared_ptr_t, upc_shared_ptr_t, size_t);
+
+/* Relaxed accesses (profiled).  */
+
+extern u_intQI_t __getgqi3 (upc_shared_ptr_t, const char *file, int line);
+extern u_intHI_t __getghi3 (upc_shared_ptr_t, const char *file, int line);
+extern u_intSI_t __getgsi3 (upc_shared_ptr_t, const char *file, int line);
+extern u_intDI_t __getgdi3 (upc_shared_ptr_t, const char *file, int line);
+#if GUPCR_TARGET64
+extern u_intTI_t __getgti3 (upc_shared_ptr_t, const char *file, int line);
+#endif
+extern float __getgsf3 (upc_shared_ptr_t, const char *file, int line);
+extern double __getgdf3 (upc_shared_ptr_t, const char *file, int line);
+extern long double __getgtf3 (upc_shared_ptr_t, const char *file, int line);
+extern long double __getgxf3 (upc_shared_ptr_t, const char *file, int line);
+extern void __getgblk5 (void *, upc_shared_ptr_t, size_t, const char *file,
+			int line);
+
+extern void __putgqi4 (upc_shared_ptr_t, u_intQI_t, const char *file,
+		       int line);
+extern void __putghi4 (upc_shared_ptr_t, u_intHI_t, const char *file,
+		       int line);
+extern void __putgsi4 (upc_shared_ptr_t, u_intSI_t, const char *file,
+		       int line);
+extern void __putgdi4 (upc_shared_ptr_t, u_intDI_t, const char *file,
+		       int line);
+#if GUPCR_TARGET64
+extern void __putgti4 (upc_shared_ptr_t, u_intTI_t, const char *file,
+		       int line);
+#endif
+extern void __putgsf4 (upc_shared_ptr_t, float, const char *file, int line);
+extern void __putgdf4 (upc_shared_ptr_t, double, const char *file, int line);
+extern void __putgtf4 (upc_shared_ptr_t, long double, const char *file,
+		       int line);
+extern void __putgxf4 (upc_shared_ptr_t, long double, const char *file,
+		       int line);
+extern void __putgblk5 (upc_shared_ptr_t, void *, size_t, const char *file,
+			int line);
+extern void __copygblk5 (upc_shared_ptr_t, upc_shared_ptr_t, size_t,
+			 const char *file, int line);
+
+/* Strict accesses (profiled).  */
+
+extern u_intQI_t __getsgqi3 (upc_shared_ptr_t, const char *file, int line);
+extern u_intHI_t __getsghi3 (upc_shared_ptr_t, const char *file, int line);
+extern u_intSI_t __getsgsi3 (upc_shared_ptr_t, const char *file, int line);
+extern u_intDI_t __getsgdi3 (upc_shared_ptr_t, const char *file, int line);
+#if GUPCR_TARGET64
+extern u_intTI_t __getsgti3 (upc_shared_ptr_t, const char *file, int line);
+#endif
+extern float __getsgsf3 (upc_shared_ptr_t, const char *file, int line);
+extern double __getsgdf3 (upc_shared_ptr_t, const char *file, int line);
+extern long double __getsgtf3 (upc_shared_ptr_t, const char *file, int line);
+extern long double __getsgxf3 (upc_shared_ptr_t, const char *file, int line);
+extern void __getsgblk5 (void *, upc_shared_ptr_t, size_t, const char *file,
+			 int line);
+
+extern void __putsgqi4 (upc_shared_ptr_t, u_intQI_t, const char *file,
+			int line);
+extern void __putsghi4 (upc_shared_ptr_t, u_intHI_t, const char *file,
+			int line);
+extern void __putsgsi4 (upc_shared_ptr_t, u_intSI_t, const char *file,
+			int line);
+extern void __putsgdi4 (upc_shared_ptr_t, u_intDI_t, const char *file,
+			int line);
+#if GUPCR_TARGET64
+extern void __putsgti4 (upc_shared_ptr_t, u_intTI_t, const char *file,
+			int line);
+#endif
+extern void __putsgsf4 (upc_shared_ptr_t, float, const char *file, int line);
+extern void __putsgdf4 (upc_shared_ptr_t, double, const char *file, int line);
+extern void __putsgtf4 (upc_shared_ptr_t, long double, const char *file,
+			int line);
+extern void __putsgxf4 (upc_shared_ptr_t, long double, const char *file,
+			int line);
+extern void __putsgblk5 (upc_shared_ptr_t, void *, size_t, const char *file,
+			 int line);
+extern void __copysgblk5 (upc_shared_ptr_t, upc_shared_ptr_t, size_t,
+			  const char *file, int line);
+
+/* Miscellaneous access related prototypes.  */
+extern void __upc_fence (void);
+
+//end lib_access_prototypes
+
+
+#endif /* gupcr_access.h */
Index: libgupc/portals4/gupcr_addr.c
===================================================================
--- libgupc/portals4/gupcr_addr.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_addr.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,195 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_node.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+
+/**
+ * @file gupcr_addr.c
+ * GUPC Portals4 shared address utility routines
+ */
+
+/**
+ * @addtogroup IFACE GUPC Interface Routines
+ * @{
+ */
+
+//begin lib_inline_access
+/**
+ * Find local pointer from pointer-to-shared.
+ *
+ * @param [in] p Pointer-to-shared
+ * @return Local address associated with "p"
+ */
+//inline
+void *
+__cvtaddr (upc_shared_ptr_t p)
+{
+  void *addr;
+  if (GUPCR_PTS_IS_NULL (p))
+    return (void *) 0;
+  addr = GUPCR_GMEM_OFF_TO_LOCAL (GUPCR_PTS_THREAD (p), GUPCR_PTS_OFFSET (p));
+  return addr;
+}
+
+//end lib_inline_access
+
+/**
+ * Find local pointer from pointer-to-shared.
+ *
+ * The pointer-to-shared value must have affinity to the current thread.
+ * @param [in] p Pointer-to-shared
+ * @return Local address associated with "p"
+ */
+void *
+__getaddr (upc_shared_ptr_t p)
+{
+  void *addr;
+  if (GUPCR_PTS_IS_NULL (p))
+    return (void *) 0;
+  if ((int) GUPCR_PTS_THREAD (p) != MYTHREAD)
+    gupcr_fatal_error
+      ("invalid conversion of shared address to local pointer;\n"
+       "thread does not have affinity to shared address");
+  addr = GUPCR_GMEM_OFF_TO_LOCAL (MYTHREAD, GUPCR_PTS_OFFSET (p));
+  return addr;
+}
+
+/** @} */
+
+/**
+ * @addtogroup PTSMANIP UPC Pointer-to-shared Manipulation Functions
+ * @{
+ */
+
+/**
+ * Return the thread of a pointer-to-shared value.
+ *
+ * The upc_threadof function returns the index of the thread
+ * that has affinity to the shared object pointed to by the argument.
+ * @param [in] p Pointer-to-shared argument
+ * @retval Thread ID of the argument
+ */
+size_t
+upc_threadof (upc_shared_ptr_t p)
+{
+  if ((int) GUPCR_PTS_THREAD (p) >= THREADS)
+    gupcr_fatal_error ("thread number %d in shared address is out of range",
+		       (int) GUPCR_PTS_THREAD (p));
+  return (size_t) GUPCR_PTS_THREAD (p);
+}
+
+/**
+ * Return the phase of a pointer-to-shared value.
+ *
+ * The upc_phaseof function returns the phase component of the
+ * pointer-to-shared argument.
+ * @param [in] p Pointer-to-shared argument
+ * @retval Phase of the argument
+ */
+size_t
+upc_phaseof (upc_shared_ptr_t p)
+{
+  if ((int) GUPCR_PTS_THREAD (p) >= THREADS)
+    gupcr_fatal_error ("thread number %d in shared address is out of range",
+		       (int) GUPCR_PTS_THREAD (p));
+  return (size_t) GUPCR_PTS_PHASE (p);
+}
+
+/**
+ * Reset the phase field of a pointer-to-shared value.
+ *
+ * The upc_resetphase function returns a pointer-to-shared value which
+ * is identical to its input except that it has zero phase.
+ * @param [in] p Pointer-to-shared argument
+ * @retval Pointer-to-shared with zero phase
+ */
+upc_shared_ptr_t
+upc_resetphase (upc_shared_ptr_t p)
+{
+  upc_shared_ptr_t result;
+  result = p;
+  GUPCR_PTS_SET_PHASE (result, 0);
+  return result;
+}
+
+/**
+ * Return the address field of a pointer-to-shared value.
+ *
+ * The upc_addrfield function returns an implementation-defined
+ * value reflecting the 'local address' of the object pointed to
+ * by the pointer-to-shared argument.
+ * @param [in] p Pointer-to-shared argument
+ * @retval Address field of the argument
+ */
+size_t
+upc_addrfield (upc_shared_ptr_t p)
+{
+  if ((int) GUPCR_PTS_THREAD (p) >= THREADS)
+    gupcr_fatal_error ("thread number %d in shared address is out of range",
+		       (int) GUPCR_PTS_THREAD (p));
+  return (size_t) GUPCR_PTS_VADDR (p);
+}
+
+/**
+ * Return the size of the local portion of the shared data
+ * with a layout described by the input parameters.
+ *
+ * A convenience function which calculates the exact size
+ * of the local portion of the data in a shared object with affinity to
+ * the thread identified by the 'threadid' parameter.
+ * @param [in] totalsize Size of the shared data
+ * @param [in] nbytes Size of the block
+ * @param [in] threadid Requested thread number
+ * @retval Size of the shared space described by the function arguments
+ */
+size_t
+upc_affinitysize (size_t totalsize, size_t nbytes, size_t threadid)
+{
+  size_t result;
+  if (nbytes == 0 || totalsize == 0 || nbytes >= totalsize)
+    result = (size_t) (threadid == 0 ? totalsize : 0);
+  else
+    {
+      size_t const nblocks = (totalsize / nbytes);
+      size_t const cutoff = (nblocks % THREADS);
+      if (threadid < cutoff)
+	result = (size_t) ((nblocks + THREADS - 1) / THREADS) * nbytes;
+      else if (threadid > cutoff)
+	result = (size_t) (nblocks / THREADS) * nbytes;
+      else
+	result = (size_t) ((nblocks / THREADS) * nbytes)
+	  + totalsize - nblocks * nbytes;
+    }
+  return result;
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_alloc.h
===================================================================
--- libgupc/portals4/gupcr_alloc.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_alloc.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,47 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_alloc.h
+ * GUPC Portals4 UPC dynamic shared memory allocation.
+ */
+
+#ifndef _GUPCR_ALLOC_H_
+#define _GUPCR_ALLOC_H_ 1
+
+extern void gupcr_alloc_init (upc_shared_ptr_t, size_t);
+
+#ifndef __UPC__
+
+extern upc_shared_ptr_t upc_global_alloc (size_t, size_t);
+extern upc_shared_ptr_t upc_all_alloc (size_t, size_t);
+extern upc_shared_ptr_t upc_local_alloc (size_t, size_t);
+extern upc_shared_ptr_t upc_alloc (size_t);
+extern void upc_free (upc_shared_ptr_t);
+
+#endif /* !__UPC__ */
+
+#endif /* gupcr_alloc.h */
Index: libgupc/portals4/gupcr_alloc.upc
===================================================================
--- libgupc/portals4/gupcr_alloc.upc	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_alloc.upc	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,641 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+/**
+ * @file gupcr_alloc.upc
+ * GUPC Portals4 UPC dynamic shared memory allocation.
+ *
+ * Implement UPC's dynamic memory allocation routines.
+ * The implementation is written in UPC, because
+ * it needs to run above the runtime library's memory mapping
+ * facility.  Internal runtime spin locks are used rather than
+ * the UPC language-defined locks, because those locks
+ * depend upon dynamic memory management, and we need to
+ * break the circular dependency.
+ *
+ * @addtogroup ALLOC GUPCR Shared Memory Allocator Functions
+ * @{
+ */
+
+#include <upc.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <assert.h>
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_utils.h"
+#include "gupcr_barrier.h"
+#include "gupcr_lock.h"
+
+struct upc_heap_list_struct;
+typedef shared struct upc_heap_list_struct *upc_heap_list_p;
+typedef struct upc_heap_list_struct
+{
+  upc_heap_list_p next;
+  upc_heap_list_p prev;
+} upc_heap_list_t;
+typedef upc_heap_list_t upc_heap_pool_t[GUPCR_HEAP_NUM_POOLS];
+typedef struct upc_heap_struct
+{
+  shared void *base;
+  upc_lock_t *lock;
+  size_t size;
+  int is_global;
+  size_t pool_avail;
+  upc_heap_pool_t pool;
+} upc_heap_t;
+typedef shared upc_heap_t *upc_heap_p;
+
+typedef struct upc_heap_node_struct
+{
+  upc_heap_list_t link;		/* Must be first.  */
+  size_t size;
+  int alloc_tag;
+  int is_global;
+} upc_heap_node_t;
+typedef shared upc_heap_node_t *upc_heap_node_p;
+
+static shared void *gupcr_heap_region_base;
+static shared void *gupcr_heap_region_top;
+static size_t gupcr_heap_region_size;
+
+static shared upc_heap_t gupcr_global_heap_info;
+static shared upc_heap_t gupcr_local_heap_info[THREADS];
+static strict shared size_t gupcr_heap_global_hi_water_mark;
+static strict shared size_t gupcr_heap_local_low_water_mark;
+static upc_heap_p gupcr_global_heap;
+static upc_heap_p gupcr_local_heap;
+
+/** Increment a shared pointer, by 'nbytes'.  */
+static inline shared void *
+gupcr_pts_add_offset (shared void *ptr, ptrdiff_t nbytes)
+{
+  return (shared void *) (((shared [] char *) ptr) + nbytes);
+}
+
+/** Return the difference between 'ptr1' and 'ptr2'. Both
+    pointers must be non-NULL and have affinity to the same thread.  */
+static inline ptrdiff_t
+gupcr_pts_diff (shared void *ptr1, shared void *ptr2)
+{
+  return (ptrdiff_t) (((shared [] char *) ptr1) - ((shared [] char *) ptr2));
+}
+
+/** Return the smallest power of 2 that is >= 'v',
+    scaled so that gupcr_log2 of the minimum allocation size is 0.  */
+static inline unsigned int
+gupcr_plog2 (unsigned long long v)
+{
+  return gupcr_log2 (GUPCR_MAX (v, GUPCR_HEAP_ALLOC_MIN)) -
+    GUPCR_HEAP_ALLOC_MIN_BITS;
+}
+
+/** Return TRUE if 'list' is empty.  */
+static inline int
+gupcr_heap_is_empty_list (upc_heap_list_p list)
+{
+  gupcr_assert (list != NULL);
+  return list->next == list;
+}
+
+/** Insert 'node' after 'here' in the double linked free list.  */
+static inline void
+gupcr_heap_list_insert (upc_heap_list_p here, upc_heap_list_p node)
+{
+  upc_heap_list_p next;
+  gupcr_assert (here != NULL);
+  gupcr_assert (node != NULL);
+  next = here->next;
+  gupcr_assert (next != NULL);
+  node->next = next;
+  node->prev = here;
+  next->prev = node;
+  here->next = node;
+}
+
+/** Remove 'node' from its position in doubly-linked free list.  */
+static inline void
+gupcr_heap_list_remove (upc_heap_list_p node)
+{
+  upc_heap_list_p next, prev;
+  gupcr_assert (node != NULL);
+  prev = node->prev;
+  gupcr_assert (prev != NULL);
+  next = node->next;
+  gupcr_assert (next != NULL);
+  prev->next = next;
+  next->prev = prev;
+  node->next = NULL;
+  node->prev = NULL;
+}
+
+/** Pop a node from the front of the free list
+    rooted at the 'p'-th pool in 'heap'.  */
+static inline upc_heap_node_p
+gupcr_heap_list_pop (upc_heap_p heap, unsigned int p)
+{
+  upc_heap_node_p node = NULL;
+  upc_heap_list_p list;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (p < GUPCR_HEAP_NUM_POOLS);
+  list = (upc_heap_list_p) &heap->pool[p];
+  if (!gupcr_heap_is_empty_list (list))
+    {
+      upc_heap_list_p first;
+      first = list->next;
+      gupcr_assert (first != NULL);
+      gupcr_heap_list_remove (first);
+      node = (upc_heap_node_p) first;
+      if (gupcr_heap_is_empty_list (list))
+	heap->pool_avail = gupcr_clear_bit (heap->pool_avail, p);
+    }
+  return node;
+}
+
+/** Push 'node' onto the front of the free list
+    rooted at the 'p'-th pool in 'heap'.  */
+static inline void
+gupcr_heap_list_push (upc_heap_p heap, unsigned int p, upc_heap_node_p node)
+{
+  upc_heap_list_p list;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (node != NULL);
+  gupcr_assert (p < GUPCR_HEAP_NUM_POOLS);
+  list = (upc_heap_list_p) &heap->pool[p];
+  if (gupcr_heap_is_empty_list (list))
+    heap->pool_avail = gupcr_set_bit (heap->pool_avail, p);
+  gupcr_heap_list_insert (list, (upc_heap_list_p) node);
+}
+
+/**
+ * Split 'node' into two nodes each of half the size.
+ *
+ * Push one of the half-sized nodes back onto an appropriate free list.
+ * Return the other half-size node.  Before calling this routine,
+ * 'node' must not be on any free list.
+ */
+static inline upc_heap_node_p
+gupcr_heap_list_split (upc_heap_p heap, upc_heap_node_p node)
+{
+  size_t node_size;
+  size_t half_size;
+  unsigned int is_global;
+  upc_heap_node_p free_half;
+  unsigned int p;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (node != NULL);
+  is_global = heap->is_global;
+  node_size = node->size;
+  half_size = ((size_t) 1 << (gupcr_log2 (node_size) - 1));
+  p = gupcr_plog2 (half_size);
+  if (is_global)
+    free_half = gupcr_pts_add_offset (node, half_size);
+  else
+    {
+      free_half = node;
+      node = gupcr_pts_add_offset (free_half, half_size);
+    }
+  upc_memset (free_half, '\0', GUPCR_HEAP_ALLOC_OVERHEAD);
+  free_half->size = half_size;
+  free_half->is_global = is_global;
+  gupcr_heap_list_push (heap, p, free_half);
+  upc_memset (node, '\0', GUPCR_HEAP_ALLOC_OVERHEAD);
+  node->size = half_size;
+  node->is_global = is_global;
+  return node;
+}
+
+/**
+ * Return the buddy of 'node'.
+ *
+ * The buddy is calculated at binary level 'p' by exclusive or-ing
+ * the p'th bit of the offset of 'node' within the heap.
+ * If there is no buddy for this block, return NULL.
+ */
+static inline upc_heap_node_p
+gupcr_heap_get_buddy (upc_heap_p heap, upc_heap_node_p node)
+{
+  shared void *heap_base;
+  size_t heap_size;
+  ptrdiff_t heap_offset, buddy_offset, max_buddy_offset;
+  unsigned int p;
+  upc_heap_node_p buddy = NULL;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (node != NULL);
+  heap_base = heap->base;
+  heap_size = heap->size;
+  heap_offset = gupcr_pts_diff (node, heap_base);
+  gupcr_assert (heap_offset >= 0);
+  p = gupcr_log2 (node->size);
+  buddy_offset = heap_offset ^ ((ptrdiff_t) 1 << p);
+  max_buddy_offset = (ptrdiff_t) heap_size - GUPCR_HEAP_ALLOC_MIN;
+  if (buddy_offset <= max_buddy_offset)
+    buddy = gupcr_pts_add_offset (heap_base, buddy_offset);
+  return buddy;
+}
+
+/**
+ * Attempt to join the node pointed to by 'node_ref'
+ * to its buddy in 'heap' of log2 size 'p'.
+ *
+ * Return TRUE if successful.  If the buddy node
+ * is the 'left' buddy, update the node pointed
+ * to by 'node_ref' to point to the buddy.
+ */
+static inline unsigned int
+gupcr_heap_list_join (upc_heap_p heap,
+		      unsigned int p, upc_heap_node_p *node_ref)
+{
+  unsigned int joined = 0;
+  upc_heap_node_p buddy, node;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (node_ref);
+  gupcr_assert (p < GUPCR_HEAP_NUM_POOLS);
+  node = *node_ref;
+  gupcr_assert (node != NULL);
+  buddy = gupcr_heap_get_buddy (heap, node);
+  /* The node can be joined with its buddy if:
+     1. The buddy is free.
+     2. The buddy has the same power-of-2 size.  */
+  if (buddy && !buddy->alloc_tag)
+    {
+      unsigned int p_buddy;
+      gupcr_assert (buddy->size > 0);
+      p_buddy = gupcr_plog2 (buddy->size);
+      if (p == p_buddy)
+	{
+	  unsigned int p_above = p + 1;
+	  upc_heap_list_p list;
+	  joined = 1;
+	  gupcr_heap_list_remove ((upc_heap_list_p) buddy);
+	  list = (upc_heap_list_p) &heap->pool[p];
+	  if (gupcr_heap_is_empty_list (list))
+	    heap->pool_avail = gupcr_clear_bit (heap->pool_avail, p);
+	  if (gupcr_pts_diff (buddy, node) < 0)
+	    {
+	      node = buddy;
+	      *node_ref = node;
+	    }
+	  node->alloc_tag = 0;
+	  node->size = ((size_t) 1 << (p_above + GUPCR_HEAP_ALLOC_MIN_BITS));
+	}
+    }
+  return joined;
+}
+
+/**
+ * Initialize the data structure used to manage
+ * operations on 'heap'.
+ *
+ * 'is_global' is TRUE if the heap is a global heap.
+ *
+ * For global heaps, 'base' points to the bottom of the heap
+ * storage area.  For local heaps, 'base' initially points
+ * to the top of the heap storage area and then grows downward.
+ */
+static inline void
+gupcr_heap_init_info (upc_heap_p heap,
+		      unsigned int is_global, shared void *base)
+{
+  unsigned int p;
+  shared [] upc_heap_list_t *pool;
+  gupcr_assert (heap != NULL);
+  upc_memset (heap, '\0', sizeof (upc_heap_t));
+  gupcr_assert (base != NULL);
+  heap->base = base;
+  heap->is_global = is_global;
+  if (is_global)
+    heap->lock = gupcr_global_heap_lock;
+  else
+    heap->lock = gupcr_local_heap_lock;
+  for (p = 0, pool = &heap->pool[0]; p < GUPCR_HEAP_NUM_POOLS; ++p, ++pool)
+    {
+      pool->next = (upc_heap_list_p) pool;
+      pool->prev = (upc_heap_list_p) pool;
+    }
+  heap->pool_avail = 0;
+}
+
+/**
+ * Initialize the global and local heap data structures.
+ *
+ * 'heap_region_base' is the shared address where the heap should begin,
+ * and 'heap_region_size' is the maximum number of bytes available
+ * for dynamic shared memory allocation.
+ */
+void
+gupcr_alloc_init (shared void *heap_region_base, size_t heap_region_size)
+{
+  shared void *local_heap_base;
+  gupcr_assert (upc_threadof (heap_region_base) == (size_t) MYTHREAD);
+  gupcr_assert (gupcr_is_pow_2 (GUPCR_HEAP_ALLOC_MIN));
+  gupcr_assert ((GUPCR_HEAP_ALLOC_OVERHEAD % 16) == 0);
+  gupcr_assert (GUPCR_HEAP_ALLOC_OVERHEAD >= sizeof (upc_heap_node_t));
+  gupcr_heap_region_base = heap_region_base;
+  gupcr_heap_region_size = heap_region_size;
+  gupcr_heap_region_top =
+    gupcr_pts_add_offset (heap_region_base, heap_region_size);
+  gupcr_global_heap = &gupcr_global_heap_info;
+  gupcr_local_heap = &gupcr_local_heap_info[MYTHREAD];
+  if (!MYTHREAD)
+    {
+      gupcr_heap_global_hi_water_mark = 0;
+      gupcr_heap_local_low_water_mark = heap_region_size;
+      gupcr_heap_init_info (&gupcr_global_heap_info, 1, heap_region_base);
+    }
+  /* The local heap base is initially the top of the UPC heap region.  */
+  local_heap_base = gupcr_heap_region_top;
+  gupcr_heap_init_info (gupcr_local_heap, 0, local_heap_base);
+}
+
+/**
+ * Allocate 'size' bytes from the heap memory region.
+ *
+ * Global allocations raise the high water mark.
+ * Local allocations potentially decrease the low water mark.
+ * Space is available as long as the high water mark
+ * does not cross above the low water mark.
+ *
+ * If successful, return a pointer to the newly allocated space.
+ * Return NULL if there is not enough space.
+ *
+ * The 'size' argument is constrained to be an exact power of 2.
+ *
+ */
+static shared void *
+gupcr_heap_region_alloc (upc_heap_p heap, size_t size)
+{
+  shared void *mem = NULL;
+  unsigned int is_global;
+  size_t heap_size, new_heap_size;
+  shared void *heap_base;
+  unsigned int have_enough_space;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (size > 0);
+  gupcr_assert (gupcr_is_pow_2 (size));
+  is_global = heap->is_global;
+  heap_size = heap->size;
+  heap_base = heap->base;
+  new_heap_size = heap_size + size;
+  have_enough_space = 0;
+  upc_lock (gupcr_heap_region_lock);
+  if (is_global)
+    {
+      size_t new_hi_water_mark;
+      new_hi_water_mark = new_heap_size;
+      if (new_hi_water_mark <= gupcr_heap_local_low_water_mark)
+	{
+	  gupcr_heap_global_hi_water_mark = new_hi_water_mark;
+	  have_enough_space = 1;
+	}
+    }
+  else
+    {
+      if (new_heap_size <= gupcr_heap_region_size)
+	{
+	  size_t new_low_water_mark;
+	  new_low_water_mark = gupcr_heap_region_size - new_heap_size;
+	  if (new_low_water_mark >= gupcr_heap_global_hi_water_mark)
+	    {
+	      if (new_low_water_mark < gupcr_heap_local_low_water_mark)
+		gupcr_heap_local_low_water_mark = new_low_water_mark;
+	      have_enough_space = 1;
+	    }
+	}
+    }
+  upc_unlock (gupcr_heap_region_lock);
+  if (have_enough_space)
+    {
+      ptrdiff_t heap_size_offset;
+      if (is_global)
+	{
+	  heap_size_offset = (ptrdiff_t) heap_size;
+	  mem = gupcr_pts_add_offset (heap_base, heap_size_offset);
+	}
+      else
+	{
+	  heap_size_offset = -((ptrdiff_t) size);
+	  heap_base = gupcr_pts_add_offset (heap_base, heap_size_offset);
+	  heap->base = heap_base;
+	  mem = heap_base;
+	}
+      heap->size = new_heap_size;
+    }
+  return mem;
+}
+
+/**
+ * Repetitively double the size of 'heap' until a free block
+ * of at least 'size' bytes (rounded up to the next power of 2)
+ * is created.
+ */
+static void
+gupcr_heap_extend (upc_heap_p heap, size_t size)
+{
+  size_t heap_size;
+  size_t extend_size;
+  unsigned int is_global;
+  unsigned int p;
+  size_t free_block_size;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (size > 0);
+  heap_size = heap->size;
+  is_global = heap->is_global;
+  extend_size = ((size_t) 1 << gupcr_log2 (size));
+  do
+    {
+      upc_heap_node_p free_block;
+      free_block_size = heap_size ? heap_size : extend_size;
+      free_block = gupcr_heap_region_alloc (heap, free_block_size);
+      if (free_block == NULL)
+	return;
+      upc_memset (free_block, '\0', GUPCR_HEAP_ALLOC_OVERHEAD);
+      free_block->size = free_block_size;
+      free_block->is_global = is_global;
+      p = gupcr_plog2 (free_block_size);
+      gupcr_heap_list_push (heap, p, free_block);
+      heap_size += free_block_size;
+      heap->size = heap_size;
+    }
+  while (free_block_size < extend_size);
+}
+
+/**
+ * Allocate a block of 'size' bytes from 'heap'.
+ */
+static shared void *
+gupcr_heap_alloc (upc_heap_p heap, size_t size)
+{
+  shared void *mem = NULL;
+  const size_t alloc_size = GUPCR_MAX (size + GUPCR_HEAP_ALLOC_OVERHEAD,
+				       GUPCR_HEAP_ALLOC_MIN);
+  const unsigned int pool_fit = gupcr_plog2 (alloc_size);
+  unsigned long long int pool_avail;
+  unsigned int p;
+  upc_heap_node_p alloc = NULL;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (size > 0);
+  upc_lock (heap->lock);
+  pool_avail = heap->pool_avail << pool_fit;
+  if (!pool_avail)
+    {
+      gupcr_heap_extend (heap, alloc_size);
+      pool_avail = heap->pool_avail << pool_fit;
+    }
+  if (pool_avail)
+    {
+      p = pool_fit + gupcr_find_first_one (pool_avail);
+      for (alloc = gupcr_heap_list_pop (heap, p); p > pool_fit; --p)
+	alloc = gupcr_heap_list_split (heap, alloc);
+      alloc->alloc_tag = GUPCR_HEAP_ALLOC_TAG;
+    }
+  if (alloc)
+    mem = gupcr_pts_add_offset (alloc, GUPCR_HEAP_ALLOC_OVERHEAD);
+  upc_unlock (heap->lock);
+  return mem;
+}
+
+/**
+ * Return the block given by 'node' into 'heap'.
+ */
+static void
+gupcr_heap_free (upc_heap_p heap, upc_heap_node_p node)
+{
+  unsigned int p;
+  upc_heap_node_p free_node;
+  gupcr_assert (heap != NULL);
+  gupcr_assert (node != NULL);
+  upc_lock (heap->lock);
+  for (p = gupcr_plog2 (node->size), free_node = node;
+       gupcr_heap_list_join (heap, p, &free_node); ++p) /* loop */ ;
+  free_node->alloc_tag = 0;
+  gupcr_heap_list_push (heap, p, free_node);
+  upc_unlock (heap->lock);
+}
+
+shared void *
+upc_global_alloc (size_t nblocks, size_t nbytes)
+{
+  size_t request_size = GUPCR_ROUND (nblocks, THREADS) * nbytes;
+  size_t alloc_size = request_size / THREADS;
+  shared void *mem = NULL;
+  gupcr_trace (FC_ALLOC, "ALLOC GLOBAL_ALLOC ENTER");
+  if (alloc_size > 0)
+    mem = gupcr_heap_alloc (gupcr_global_heap, alloc_size);
+  gupcr_trace (FC_ALLOC, "ALLOC GLOBAL_ALLOC EXIT %u:0x%lx %lu",
+	       (unsigned) upc_threadof (mem),
+	       (long unsigned) upc_addrfield (mem),
+	       (long unsigned) nbytes);
+  return mem;
+}
+
+shared void *
+upc_all_alloc (size_t nblocks, size_t nbytes)
+{
+  size_t request_size = GUPCR_ROUND (nblocks, THREADS) * nbytes;
+  size_t alloc_size = request_size / THREADS;
+  shared void *mem = NULL;
+  gupcr_trace (FC_ALLOC, "ALLOC ALL_ALLOC ENTER");
+  if (alloc_size > 0)
+    {
+      if (MYTHREAD == 0)
+	{
+	  mem = gupcr_heap_alloc (gupcr_global_heap, alloc_size);
+	  gupcr_bcast_send (&mem, sizeof (mem));
+	}
+      else
+	gupcr_bcast_recv (&mem, sizeof (mem));
+    }
+  gupcr_trace (FC_ALLOC, "ALLOC ALL_ALLOC EXIT %u:0x%lx %lu",
+	       (unsigned) upc_threadof (mem),
+	       (long unsigned) upc_addrfield (mem),
+	       (long unsigned) nbytes);
+  return mem;
+}
+
+shared void *
+upc_alloc (size_t nbytes)
+{
+  shared void *mem = NULL;
+  gupcr_trace (FC_ALLOC, "ALLOC ALLOC ENTER");
+  if (nbytes)
+    mem = gupcr_heap_alloc (gupcr_local_heap, nbytes);
+  gupcr_trace (FC_ALLOC, "ALLOC ALLOC EXIT %u:0x%lx %lu",
+	       (unsigned) upc_threadof (mem),
+	       (long unsigned) upc_addrfield (mem),
+	       (long unsigned) nbytes);
+  return mem;
+}
+
+void
+upc_all_free (shared void *ptr)
+{
+  if (ptr)
+    {
+      const int thread = (int) upc_threadof (ptr);
+      upc_barrier - 1;
+      /* Check for errors only on thread 0.  */
+      if ((MYTHREAD == 0) && (thread >= THREADS))
+	gupcr_error ("upc_all_free() called with invalid shared pointer");
+      if (thread == MYTHREAD)
+	upc_free (ptr);
+    }
+}
+
+void
+upc_free (shared void *ptr)
+{
+  gupcr_trace (FC_ALLOC, "ALLOC FREE ENTER %u:0x%lx",
+	       (unsigned) upc_threadof (ptr),
+	       (long unsigned) upc_addrfield (ptr));
+  if (ptr)
+    {
+      const size_t offset __attribute__ ((unused)) = upc_addrfield (ptr);
+      const int thread = (int) upc_threadof (ptr);
+      const size_t phase = upc_phaseof (ptr);
+      upc_heap_p heap;
+      upc_heap_node_p node;
+      unsigned int is_global;
+      if (phase || thread >= THREADS)
+	gupcr_error ("upc_free() called with invalid shared pointer");
+      node = gupcr_pts_add_offset (ptr, -GUPCR_HEAP_ALLOC_OVERHEAD);
+      is_global = node->is_global;
+      if (is_global && thread)
+	gupcr_error ("upc_free() called with invalid shared pointer");
+      if (node->alloc_tag != GUPCR_HEAP_ALLOC_TAG)
+	gupcr_error ("upc_free() called with pointer to unallocated space");
+      if (is_global)
+	heap = gupcr_global_heap;
+      else
+	heap = &gupcr_local_heap_info[thread];
+      gupcr_heap_free (heap, node);
+    }
+  gupcr_trace (FC_ALLOC, "ALLOC FREE EXIT");
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_atomic.upc
===================================================================
--- libgupc/portals4/gupcr_atomic.upc	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_atomic.upc	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,600 @@
+/* Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include <upc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <upc_atomic.h>
+#include <portals4.h>
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_atomic_sup.h"
+
+/**
+ * @file gupcr_atomic.upc
+ * GUPC Portals4 UPC atomics implementation.
+ *
+ * All UPC atomic operations and data types, with exception of UPC_PTS,
+ * are almost completely matched to the corresponding Portals4 atomics.
+ * The following exceptions are made:
+ *
+ * UPC_SUB  Converted into Portals4 atomic add of a negative number.
+ * UPC_INC  Converted into Portals4 atomic add of one.
+ * UPC_DEC  Converted into Portals4 atomic add of negative one.
+ *
+ * UPC_PTS data type does not use Portals4 atomic operations (even though
+ * 64 bit pointer-to-shared can fit into the int64 container).  This is
+ * mainly due to the fact that pointer-to-shared comparison has to
+ * disregard the phase part of the pointer and Portals4 does not have
+ * support for CSWAP with a mask.
+ */
+
+/**
+ * @addtogroup ATOMIC GUPCR Atomics Support Functions
+ * @{
+ */
+
+/** Atomic domain representation (one instance per thread,
+    allocated collectively by upc_all_atomicdomain_alloc).  */
+struct upc_atomicdomain_struct
+{
+  upc_lock_t *lock;	/**< Serializes UPC_PTS ops; NULL for other types */
+  upc_op_t ops;		/**< Bitmask of operations allowed on this domain */
+  upc_type_t type;	/**< UPC atomic data type of the domain */
+};
+
+/**
+ * Convert UPC to Portals4 atomic data type.
+ *
+ * @param [in] upc_type UPC atomic data type
+ * @retval Portals4 atomic data type, or -1 (cast to ptl_datatype_t)
+ *         when UPC_TYPE has no Portals4 mapping
+ */
+static inline ptl_datatype_t
+gupcr_atomic_to_ptl_type (upc_type_t upc_type)
+{
+  switch (upc_type)
+    {
+    case UPC_INT:
+      return UPC_ATOMIC_TO_PTL_INT;
+    case UPC_UINT:
+      return UPC_ATOMIC_TO_PTL_UINT;
+    case UPC_LONG:
+      return UPC_ATOMIC_TO_PTL_LONG;
+    case UPC_ULONG:
+      return UPC_ATOMIC_TO_PTL_ULONG;
+    case UPC_INT32:
+      return UPC_ATOMIC_TO_PTL_INT32;
+    case UPC_UINT32:
+      return UPC_ATOMIC_TO_PTL_UINT32;
+    case UPC_INT64:
+      return UPC_ATOMIC_TO_PTL_INT64;
+    case UPC_UINT64:
+      return UPC_ATOMIC_TO_PTL_UINT64;
+    case UPC_FLOAT:
+      return UPC_ATOMIC_TO_PTL_FLOAT;
+    case UPC_DOUBLE:
+      return UPC_ATOMIC_TO_PTL_DOUBLE;
+    default:
+      /* NOTE(review): gupcr_error presumably reports without aborting
+         (fatal paths use gupcr_fatal_error), so control reaches the
+         sentinel return -- confirm.  */
+      gupcr_error ("invalid UPC atomic type %d", (int) upc_type);
+    }
+  return -1;
+}
+
+/**
+ * Convert UPC to Portals4 atomic operation.
+ *
+ * Only directly-mappable operations appear here; UPC_SUB, UPC_INC and
+ * UPC_DEC are rewritten by the caller into PTL_SUM of a (negated or
+ * unit) value.
+ *
+ * @param [in] upc_op UPC atomic operation
+ * @retval Portals4 atomic operation, or -1 (cast to ptl_op_t) when
+ *         UPC_OP has no direct Portals4 equivalent
+ */
+static inline ptl_op_t
+gupcr_atomic_to_ptl_op (upc_op_t upc_op)
+{
+  switch (upc_op)
+    {
+    case UPC_ADD:
+      return PTL_SUM;
+    case UPC_MULT:
+      return PTL_PROD;
+    case UPC_MAX:
+      return PTL_MAX;
+    case UPC_MIN:
+      return PTL_MIN;
+    case UPC_AND:
+      return PTL_BAND;
+    case UPC_OR:
+      return PTL_BOR;
+    case UPC_XOR:
+      return PTL_BXOR;
+    default:
+      /* Non-fatal report; -1 sentinel is returned below.  */
+      gupcr_error ("invalid UPC atomic op %d", (int) upc_op);
+    }
+  return -1;
+}
+
+/**
+ * Convert UPC atomic operation into a string.
+ *
+ * @param [in] upc_op UPC atomic operation
+ * @retval Character string naming the operation, or
+ *         "UNKNOWN ATOMIC OP" if it is not recognized
+ */
+static const char *
+gupcr_get_atomic_op_as_string (upc_op_t upc_op)
+{
+  static const struct
+  {
+    upc_op_t op;
+    const char *name;
+  } op_names[] =
+  {
+    {UPC_ADD, "UPC_ADD"},
+    {UPC_AND, "UPC_AND"},
+    {UPC_CSWAP, "UPC_CSWAP"},
+    {UPC_DEC, "UPC_DEC"},
+    {UPC_INC, "UPC_INC"},
+    {UPC_GET, "UPC_GET"},
+    {UPC_MAX, "UPC_MAX"},
+    {UPC_MIN, "UPC_MIN"},
+    {UPC_MULT, "UPC_MULT"},
+    {UPC_OR, "UPC_OR"},
+    {UPC_SET, "UPC_SET"},
+    {UPC_SUB, "UPC_SUB"},
+    {UPC_XOR, "UPC_XOR"}
+  };
+  size_t i;
+  for (i = 0; i < sizeof (op_names) / sizeof (op_names[0]); ++i)
+    if (op_names[i].op == upc_op)
+      return op_names[i].name;
+  return "UNKNOWN ATOMIC OP";
+}
+
+/**
+ * Convert UPC atomic type into a string.
+ *
+ * @param [in] upc_type UPC atomic type
+ * @retval Character string naming the type, or
+ *         "UNKNOWN ATOMIC TYPE" if it is not recognized
+ */
+static const char *
+gupcr_get_atomic_type_as_string (upc_type_t upc_type)
+{
+  static const struct
+  {
+    upc_type_t type;
+    const char *name;
+  } type_names[] =
+  {
+    {UPC_INT, "UPC_INT"},
+    {UPC_UINT, "UPC_UINT"},
+    {UPC_LONG, "UPC_LONG"},
+    {UPC_ULONG, "UPC_ULONG"},
+    {UPC_INT32, "UPC_INT32"},
+    {UPC_UINT32, "UPC_UINT32"},
+    {UPC_INT64, "UPC_INT64"},
+    {UPC_UINT64, "UPC_UINT64"},
+    {UPC_FLOAT, "UPC_FLOAT"},
+    {UPC_DOUBLE, "UPC_DOUBLE"},
+    {UPC_PTS, "UPC_PTS"}
+  };
+  size_t i;
+  for (i = 0; i < sizeof (type_names) / sizeof (type_names[0]); ++i)
+    if (type_names[i].type == upc_type)
+      return type_names[i].name;
+  return "UNKNOWN ATOMIC TYPE";
+}
+
+/** Set value by UPC atomic type macro.
+    NOTE: expands against the enclosing function's 'buf' and 'value'
+    locals; the __name__ argument is documentation only.  */
+#define FUNC_TYPE_SET(__name__,__type__)    \
+	*(__type__ *) buf = (__type__) value
+
+/**
+ * Set buffer to the value of the particular UPC atomic type.
+ *
+ * Used by upc_atomic_relaxed to materialize the +1/-1 constants
+ * needed for UPC_INC/UPC_DEC.
+ *
+ * @param [in] buf Pointer to the buffer to set
+ * @param [in] type UPC atomic type
+ * @param [in] value Value to be set
+ */
+static void
+gupcr_set_optype_val (void *buf, upc_type_t type, int value)
+{
+  switch (type)
+    {
+    case UPC_INT:
+      FUNC_TYPE_SET (UPC_INT, int);
+      break;
+    case UPC_UINT:
+      FUNC_TYPE_SET (UPC_UINT, unsigned int);
+      break;
+    case UPC_LONG:
+      FUNC_TYPE_SET (UPC_LONG, long);
+      break;
+    case UPC_ULONG:
+      FUNC_TYPE_SET (UPC_ULONG, unsigned long);
+      break;
+    case UPC_INT32:
+      FUNC_TYPE_SET (UPC_INT32, int32_t);
+      break;
+    case UPC_UINT32:
+      FUNC_TYPE_SET (UPC_UINT32, uint32_t);
+      break;
+    case UPC_INT64:
+      FUNC_TYPE_SET (UPC_INT64, int64_t);
+      break;
+    case UPC_UINT64:
+      FUNC_TYPE_SET (UPC_UINT64, uint64_t);
+      break;
+    case UPC_FLOAT:
+      FUNC_TYPE_SET (UPC_FLOAT, float);
+      break;
+    case UPC_DOUBLE:
+      FUNC_TYPE_SET (UPC_DOUBLE, double);
+      break;
+    default:
+      gupcr_error ("wrong UPC type (%d)", type);
+    }
+}
+
+/** Negate value by UPC atomic type macro.
+    NOTE: expands against the enclosing function's 'dbuf' and 'sbuf'
+    locals; the __name__ argument is documentation only.  */
+#define FUNC_TYPE_NEGATE(__name__,__type__)    \
+      *(__type__ *) dbuf = - *(__type__*) sbuf
+
+/**
+ * Negate value of the particular UPC atomic type.
+ *
+ * Used by upc_atomic_relaxed to convert UPC_SUB into an atomic add
+ * of the negated operand (Portals4 has no atomic subtract).
+ *
+ * @param [in] dbuf Pointer to negated value
+ * @param [in] sbuf Pointer to original value
+ * @param [in] type UPC atomic type
+ */
+static void
+gupcr_negate_atomic_type (void *dbuf, const void *sbuf, upc_type_t type)
+{
+  switch (type)
+    {
+    case UPC_INT:
+      FUNC_TYPE_NEGATE (UPC_INT, int);
+      break;
+    case UPC_UINT:
+      FUNC_TYPE_NEGATE (UPC_UINT, unsigned int);
+      break;
+    case UPC_LONG:
+      FUNC_TYPE_NEGATE (UPC_LONG, long);
+      break;
+    case UPC_ULONG:
+      FUNC_TYPE_NEGATE (UPC_ULONG, unsigned long);
+      break;
+    case UPC_INT32:
+      FUNC_TYPE_NEGATE (UPC_INT32, int32_t);
+      break;
+    case UPC_UINT32:
+      FUNC_TYPE_NEGATE (UPC_UINT32, uint32_t);
+      break;
+    case UPC_INT64:
+      FUNC_TYPE_NEGATE (UPC_INT64, int64_t);
+      break;
+    case UPC_UINT64:
+      FUNC_TYPE_NEGATE (UPC_UINT64, uint64_t);
+      break;
+    case UPC_FLOAT:
+      FUNC_TYPE_NEGATE (UPC_FLOAT, float);
+      break;
+    case UPC_DOUBLE:
+      FUNC_TYPE_NEGATE (UPC_DOUBLE, double);
+      break;
+    default:
+      gupcr_error ("wrong UPC type (%d)", type);
+    }
+}
+
+/** @} */
+
+/**
+ * @addtogroup UPCATOMIC UPC Atomics Functions
+ * @{
+ */
+
+/**
+ * UPC atomic relaxed operation.
+ *
+ * @param [in] domain Atomic domain
+ * @param [in] fetch_ptr Target of the update (must be non-NULL for UPC_GET)
+ * @param [in] op Atomic operation
+ * @param [in] target Target address of the operation
+ * @param [in] operand1 Operation required argument
+ * @param [in] operand2 Operation required argument
+ */
+void
+upc_atomic_relaxed (upc_atomicdomain_t * domain,
+		    void *restrict fetch_ptr, upc_op_t op,
+		    shared void *restrict target,
+		    const void *restrict operand1,
+		    const void *restrict operand2)
+{
+  struct upc_atomicdomain_struct *ldomain;
+  char cvt_buf[GUPC_MAX_ATOMIC_SIZE];
+
+  /* Complete all strict operations.  Portals4 runtime allows only
+     outstanding put operations.  */
+  if (gupcr_pending_strict_put)
+    gupcr_gmem_sync_puts ();
+
+  if (domain == NULL)
+    gupcr_fatal_error ("NULL atomic domain pointer specified");
+
+  ldomain = (struct upc_atomicdomain_struct *) &domain[MYTHREAD];
+
+  gupcr_trace (FC_ATOMIC, "ATOMIC ENTER %s %s",
+	       gupcr_get_atomic_op_as_string (op),
+	       gupcr_get_atomic_type_as_string (ldomain->type));
+
+  if (target == NULL)
+    gupcr_fatal_error ("NULL atomic target pointer specified");
+
+  /* 'ops' is the bitmask of operations the domain was created with;
+     membership must be tested with bitwise AND.  (The previous
+     logical '&&' accepted any non-zero op whenever the domain
+     allowed at least one operation.)  */
+  if (!(op & ldomain->ops))
+    {
+      gupcr_fatal_error ("invalid operation (%s) for specified domain",
+			 gupcr_get_atomic_op_as_string (op));
+    }
+
+  /* Check arguments.  */
+  switch (op)
+    {
+    case UPC_GET:
+      if (fetch_ptr == NULL)
+	gupcr_fatal_error (
+	  "atomic operation (UPC_GET) requires a non-NULL fetch pointer");
+      /* FALLTHROUGH -- UPC_GET, like UPC_INC/UPC_DEC, takes no operands.  */
+    case UPC_INC:
+    case UPC_DEC:
+      if (operand1 != NULL)
+	gupcr_error ("atomic operation (%s) requires a NULL operand1",
+		     gupcr_get_atomic_op_as_string (op));
+      if (operand2 != NULL)
+	gupcr_error ("atomic operation (%s) requires a NULL operand2",
+		     gupcr_get_atomic_op_as_string (op));
+      break;
+    case UPC_CSWAP:
+      if (operand1 == NULL)
+	gupcr_fatal_error (
+	  "atomic operation (UPC_CSWAP) requires a non-NULL operand1");
+      if (operand2 == NULL)
+	gupcr_fatal_error (
+	  "atomic operation (UPC_CSWAP) requires a non-NULL operand2");
+      break;
+    default:
+      if (operand1 == NULL)
+	gupcr_fatal_error (
+		"atomic operation (%s) requires a non-NULL operand1",
+		gupcr_get_atomic_op_as_string (op));
+      if (operand2 != NULL)
+	gupcr_error ("atomic operation (%s) requires a NULL operand2",
+		     gupcr_get_atomic_op_as_string (op));
+    }
+
+  /* UPC_PTS data type does not use Portals4 atomic operations, even
+     though a 64-bit pointer-to-shared fits in the int64 container.
+     UPC_PTS supports only access operations (get, set, cswap);
+     because pointer comparison must disregard the phase part of the
+     pointer, it cannot be placed in an integral container
+     (e.g. int64) and handled with Portals4 atomic ops, so the
+     per-domain lock serializes these accesses instead.  */
+  if (ldomain->type == UPC_PTS)
+    {
+      upc_lock (ldomain->lock);
+      switch (op)
+	{
+	case UPC_GET:
+	  *(shared void **) fetch_ptr = *(shared void *shared *) target;
+	  break;
+	case UPC_SET:
+	  if (fetch_ptr)
+	    *(shared void **) fetch_ptr = *(shared void *shared *) target;
+	  *(shared void *shared *) target = *(shared void **) operand1;
+	  break;
+	case UPC_CSWAP:
+	  {
+	    shared void *tmp = *(shared void *shared *) target;
+	    if (*(shared void **) operand1 == tmp)
+	      *(shared void *shared *) target = *(shared void **) operand2;
+	    if (fetch_ptr)
+	      *(shared void **) fetch_ptr = tmp;
+	  }
+	  break;
+	default:
+	  /* Release the lock before aborting.  */
+	  upc_unlock (ldomain->lock);
+	  gupcr_fatal_error ("invalid atomic operation (%s) for UPC_PTS",
+			      gupcr_get_atomic_op_as_string (op));
+	}
+      upc_unlock (ldomain->lock);
+    }
+  else
+    {
+      size_t dthread = upc_threadof (target);
+      size_t doffset = upc_addrfield (target);
+
+      switch (op)
+	{
+	case UPC_GET:
+	  gupcr_atomic_get (dthread, doffset, fetch_ptr,
+			    gupcr_atomic_to_ptl_type (ldomain->type));
+	  break;
+	case UPC_SET:
+	  gupcr_atomic_set (dthread, doffset, fetch_ptr, operand1,
+			    gupcr_atomic_to_ptl_type (ldomain->type));
+	  break;
+	case UPC_CSWAP:
+	  gupcr_atomic_cswap (dthread, doffset, fetch_ptr, operand1, operand2,
+			      gupcr_atomic_to_ptl_type (ldomain->type));
+	  break;
+	case UPC_AND:
+	case UPC_OR:
+	case UPC_XOR:
+	  /* Bitwise operations are invalid for floating-point types
+	     (the UPC_PTS test is defensive; that type was handled
+	     above).  */
+	  if (ldomain->type == UPC_PTS ||
+	      ldomain->type == UPC_FLOAT || ldomain->type == UPC_DOUBLE)
+	    {
+	      gupcr_fatal_error (
+			"invalid atomic operation (%s) for %s type",
+			gupcr_get_atomic_op_as_string (op),
+			gupcr_get_atomic_type_as_string (ldomain->type));
+	    }
+	  gupcr_atomic_op (dthread, doffset, fetch_ptr, operand1,
+			   gupcr_atomic_to_ptl_op (op),
+			   gupcr_atomic_to_ptl_type (ldomain->type));
+	  break;
+	case UPC_ADD:
+	case UPC_MULT:
+	case UPC_MIN:
+	case UPC_MAX:
+	  gupcr_atomic_op (dthread, doffset, fetch_ptr,
+			   operand1, gupcr_atomic_to_ptl_op (op),
+			   gupcr_atomic_to_ptl_type (ldomain->type));
+	  break;
+	case UPC_SUB:
+	  /* As Portals4 does not have atomic subtract, UPC_SUB must be
+	     converted into atomic add, UPC_ADD.  */
+	  gupcr_negate_atomic_type (cvt_buf, operand1, ldomain->type);
+	  gupcr_atomic_op (dthread, doffset, fetch_ptr,
+			   cvt_buf, gupcr_atomic_to_ptl_op (UPC_ADD),
+			   gupcr_atomic_to_ptl_type (ldomain->type));
+	  break;
+	case UPC_INC:
+	case UPC_DEC:
+	  /* Implemented as atomic add of +1/-1.  */
+	  if (op == UPC_INC)
+	    gupcr_set_optype_val (cvt_buf, ldomain->type, 1);
+	  else
+	    gupcr_set_optype_val (cvt_buf, ldomain->type, -1);
+	  gupcr_atomic_op (dthread, doffset, fetch_ptr, cvt_buf, PTL_SUM,
+			   gupcr_atomic_to_ptl_type (ldomain->type));
+	  break;
+	default:
+	  gupcr_fatal_error ("invalid atomic operation: %s",
+			     gupcr_get_atomic_op_as_string (op));
+	}
+    }
+  gupcr_trace (FC_ATOMIC, "ATOMIC EXIT");
+}
+
+/**
+ * UPC atomic strict operation.
+ *
+ * Strict memory semantics are obtained by bracketing the relaxed
+ * operation with UPC fences.
+ *
+ * @param [in] domain Atomic domain
+ * @param [in] fetch_ptr Target of the update
+ * @param [in] op Atomic operation
+ * @param [in] target Target address of the operation
+ * @param [in] operand1 Operation required argument
+ * @param [in] operand2 Operation required argument
+ */
+void
+upc_atomic_strict (upc_atomicdomain_t * domain,
+		   void *restrict fetch_ptr,
+		   upc_op_t op,
+		   shared void *restrict target,
+		   const void *restrict operand1,
+		   const void *restrict operand2)
+{
+  upc_fence;
+  upc_atomic_relaxed (domain, fetch_ptr, op, target, operand1, operand2);
+  upc_fence;
+}
+
+/**
+ * Collective allocation of atomic domain.
+ *
+ * Implementation uses native Portals4 atomic functions and the
+ * hint field is ignored.
+ *
+ * @param [in] type Atomic operation type
+ * @param [in] ops Atomic domain operations (bitmask)
+ * @param [in] hints Atomic operation hint (ignored)
+ * @retval Allocated atomic domain pointer
+ */
+upc_atomicdomain_t *
+upc_all_atomicdomain_alloc (upc_type_t type,
+			    upc_op_t ops,
+			    upc_atomichint_t hints __attribute__ ((unused)))
+{
+  struct upc_atomicdomain_struct *ldomain;
+  shared upc_atomicdomain_t *domain;
+
+  gupcr_trace (FC_ATOMIC, "ATOMIC DOMAIN_ALLOC ENTER %s ops(%X)",
+	       gupcr_get_atomic_type_as_string (type), (unsigned) ops);
+  /* One domain structure per thread, allocated collectively.  */
+  domain = (upc_atomicdomain_t *)
+    upc_all_alloc (THREADS, sizeof (struct upc_atomicdomain_struct));
+  gupcr_assert (domain != NULL);
+
+  ldomain = (struct upc_atomicdomain_struct *) &domain[MYTHREAD];
+  ldomain->lock = NULL;
+  /* UPC_PTS operations are serialized with a lock; all other types
+     use Portals4 atomics and leave 'lock' NULL.  */
+  if (type == UPC_PTS)
+    ldomain->lock = upc_all_lock_alloc ();
+  ldomain->ops = ops;
+  ldomain->type = type;
+  gupcr_trace (FC_ATOMIC, "ATOMIC DOMAIN_ALLOC EXIT");
+  return domain;
+}
+
+/**
+ * Collective free of the atomic domain.
+ *
+ * All threads synchronize before and after thread 0 releases the
+ * domain's storage (and lock, if any).
+ *
+ * @param [in] domain Pointer to atomic domain
+ *
+ * @ingroup UPCATOMIC UPC Atomic Functions
+ */
+void
+upc_all_atomicdomain_free (upc_atomicdomain_t * domain)
+{
+  if (domain == NULL)
+    gupcr_fatal_error ("NULL atomic domain pointer specified");
+  upc_barrier;
+  if (MYTHREAD == 0)
+    {
+      /* For non-UPC_PTS domains 'lock' is NULL; assumes
+         upc_lock_free (NULL) is a no-op -- TODO confirm against the
+         lock implementation.  */
+      upc_lock_free (domain->lock);
+      upc_free (domain);
+    }
+  upc_barrier;
+}
+
+/**
+ * Query implementation for expected performance.
+ *
+ * @param [in] type Atomic operation type
+ * @param [in] ops Atomic domain operations
+ * @param [in] addr Atomic address
+ * @retval Expected performance
+ *         (UPC_ATOMIC_PERFORMANCE_FAST or _NOT_FAST)
+ */
+int
+upc_atomic_isfast (upc_type_t type __attribute__ ((unused)),
+		   upc_op_t ops __attribute__ ((unused)),
+		   shared void *addr __attribute__ ((unused)))
+{
+  /* UPC_PTS is implemented with a lock, not native Portals4
+     atomics, so it is reported as not fast.  */
+  if (type == UPC_PTS)
+    return UPC_ATOMIC_PERFORMANCE_NOT_FAST;
+  return UPC_ATOMIC_PERFORMANCE_FAST;
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_atomic_sup.c
===================================================================
--- libgupc/portals4/gupcr_atomic_sup.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_atomic_sup.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,292 @@
+/* Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_lib.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+#include "gupcr_atomic_sup.h"
+
+/**
+ * @file gupcr_atomic_sup.c
+ * GUPC Portals4 atomic support routines.
+ *
+ * @addtogroup ATOMIC GUPCR Atomics Support Functions
+ * @{
+ */
+
+/** Atomic local access MD handle (bound over the whole user address
+    space in gupcr_atomic_init).  */
+static ptl_handle_md_t gupcr_atomic_md;
+/** Atomic local access MD counting events handle */
+static ptl_handle_ct_t gupcr_atomic_md_ct;
+/** Atomic local access MD event queue handle (receives failure
+    events only; success events are disabled on the MD).  */
+static ptl_handle_eq_t gupcr_atomic_md_eq;
+/** Atomic number of received ACKs on local md */
+static ptl_size_t gupcr_atomic_md_count;
+
+/** Atomic operations use remote gmem PTE */
+#define GUPCR_PTL_PTE_ATOMIC GUPCR_PTL_PTE_GMEM
+
+/**
+ * Atomic GET operation.
+ *
+ * A simple Portals4 get operation is sufficient for data
+ * types supported by UPC.
+ *
+ * @param[in] dthread Destination thread
+ * @param[in] doffset Destination offset
+ * @param[in] fetch_ptr Fetch value pointer
+ * @param[in] type Atomic data type
+ */
+void
+gupcr_atomic_get (size_t dthread, size_t doffset, void *fetch_ptr,
+		  ptl_datatype_t type)
+{
+  ptl_ct_event_t ct;
+  ptl_process_t rpid;
+  char tmpbuf[128] __attribute__ ((unused));
+  size_t size;
+
+  gupcr_debug (FC_ATOMIC, "%lu:0x%lx", dthread, doffset);
+  /* NOTE(review): gupcr_error presumably does not abort; when
+     fetch_ptr is NULL the PtlGet below still executes with a NULL
+     local offset.  upc_atomic_relaxed already treats this case as
+     fatal -- confirm this path is intentionally non-fatal.  */
+  if (fetch_ptr == NULL)
+    gupcr_error ("UPC_GET fetch pointer is NULL");
+
+  size = gupcr_get_atomic_size (type);
+  rpid.rank = dthread;
+  gupcr_portals_call (PtlGet, (gupcr_atomic_md, (ptl_size_t) fetch_ptr,
+			       size, rpid, GUPCR_PTL_PTE_ATOMIC,
+			       PTL_NO_MATCH_BITS, doffset,
+			       PTL_NULL_USER_PTR));
+  /* Wait for the reply counting event on the local MD.  */
+  gupcr_atomic_md_count += 1;
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_atomic_md_ct, gupcr_atomic_md_count, &ct));
+  if (ct.failure)
+    {
+      gupcr_process_fail_events (gupcr_atomic_md_eq);
+      gupcr_fatal_error ("received an error on atomic MD");
+    }
+  gupcr_debug (FC_ATOMIC, "ov(%s)",
+	       gupcr_get_buf_as_hex (tmpbuf, fetch_ptr, size));
+}
+
+/**
+ * Portals4 atomic set operation.
+ *
+ * Execute Portals4 PtlSwap with PTL_SWAP operation; the previous
+ * value is returned through FETCH_PTR when requested.
+ *
+ * @param[in] dthread Destination thread
+ * @param[in] doffset Destination offset
+ * @param[in] fetch_ptr Fetch value pointer (optional)
+ * @param[in] value New value of atomic variable
+ * @param[in] type Atomic data type
+ */
+void
+gupcr_atomic_set (size_t dthread, size_t doffset, void *fetch_ptr,
+		  const void *value, ptl_datatype_t type)
+{
+  ptl_process_t rpid;
+  ptl_ct_event_t ct;
+  char tmpbuf[128] __attribute__ ((unused));
+  char atomic_tmp_buf[GUPC_MAX_ATOMIC_SIZE];
+  size_t size = gupcr_get_atomic_size (type);
+  gupcr_debug (FC_ATOMIC, "%lu:0x%lx v(%s)", dthread, doffset,
+	       gupcr_get_buf_as_hex (tmpbuf, value, size));
+  rpid.rank = dthread;
+  /* Unconditional swap; the old value lands in atomic_tmp_buf.  */
+  gupcr_portals_call (PtlSwap, (gupcr_atomic_md,
+				(ptl_size_t) atomic_tmp_buf,
+				gupcr_atomic_md, (ptl_size_t) value,
+				size, rpid, GUPCR_PTL_PTE_ATOMIC,
+				PTL_NO_MATCH_BITS, doffset, PTL_NULL_USER_PTR,
+				PTL_NULL_HDR_DATA, NULL, PTL_SWAP, type));
+  /* Wait for the reply counting event on the local MD.  */
+  gupcr_atomic_md_count += 1;
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_atomic_md_ct, gupcr_atomic_md_count, &ct));
+  if (ct.failure)
+    {
+      gupcr_process_fail_events (gupcr_atomic_md_eq);
+      gupcr_fatal_error ("received an error on atomic MD");
+    }
+  if (fetch_ptr)
+    {
+      gupcr_debug (FC_ATOMIC, "ov(%s)",
+		   gupcr_get_buf_as_hex (tmpbuf, atomic_tmp_buf, size));
+      memcpy (fetch_ptr, atomic_tmp_buf, size);
+    }
+}
+
+/**
+ * Portals4 atomic CSWAP operation.
+ *
+ * Execute Portals4 PtlSwap with PTL_CSWAP operation; the previous
+ * value is returned through FETCH_PTR when requested.
+ *
+ * @param[in] dthread Destination thread
+ * @param[in] doffset Destination offset
+ * @param[in] fetch_ptr Fetch value pointer (optional)
+ * @param[in] expected Expected value of atomic variable
+ * @param[in] value New value of atomic variable
+ * @param[in] type Atomic data type
+ */
+void
+gupcr_atomic_cswap (size_t dthread, size_t doffset, void *fetch_ptr,
+		    const void *expected, const void *value,
+		    ptl_datatype_t type)
+{
+  ptl_process_t rpid;
+  ptl_ct_event_t ct;
+  char tmpbuf[128] __attribute__ ((unused));
+  char tmpbuf2[128] __attribute__ ((unused));
+  char atomic_tmp_buf[GUPC_MAX_ATOMIC_SIZE];
+  size_t size = gupcr_get_atomic_size (type);
+  /* Use two scratch buffers below: gupcr_get_buf_as_hex returns its
+     buffer argument, so formatting both values into the same buffer
+     would make the trace print one value twice.  */
+  gupcr_debug (FC_ATOMIC, "%lu:0x%lx v(%s) e(%s)", dthread, doffset,
+	       gupcr_get_buf_as_hex (tmpbuf, value, size),
+	       gupcr_get_buf_as_hex (tmpbuf2, expected, size));
+  rpid.rank = dthread;
+  gupcr_portals_call (PtlSwap, (gupcr_atomic_md,
+				(ptl_size_t) atomic_tmp_buf,
+				gupcr_atomic_md, (ptl_size_t) value,
+				size, rpid,
+				GUPCR_PTL_PTE_ATOMIC, PTL_NO_MATCH_BITS,
+				doffset, PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA,
+				expected, PTL_CSWAP, type));
+  /* Wait for the reply counting event on the local MD.  */
+  gupcr_atomic_md_count += 1;
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_atomic_md_ct, gupcr_atomic_md_count, &ct));
+  if (ct.failure)
+    {
+      gupcr_process_fail_events (gupcr_atomic_md_eq);
+      gupcr_fatal_error ("received an error on atomic MD");
+    }
+  if (fetch_ptr)
+    {
+      gupcr_debug (FC_ATOMIC, "ov(%s)",
+		   gupcr_get_buf_as_hex (tmpbuf, atomic_tmp_buf, size));
+      memcpy (fetch_ptr, atomic_tmp_buf, size);
+    }
+}
+
+/**
+ * Portals4 atomic operation.
+ *
+ * Execute Portals4 atomic function and return the old value
+ * if requested.
+ * @param[in] dthread Destination thread
+ * @param[in] doffset Destination offset
+ * @param[in] fetch_ptr Fetch value pointer (optional)
+ * @param[in] value Atomic value for the operation
+ * @param[in] op Atomic operation
+ * @param[in] type Atomic data type
+ */
+void
+gupcr_atomic_op (size_t dthread, size_t doffset, void *fetch_ptr,
+		 const void *value, ptl_op_t op, ptl_datatype_t type)
+{
+  ptl_process_t rpid;
+  ptl_ct_event_t ct;
+  char tmpbuf[128] __attribute__ ((unused));
+  char atomic_tmp_buf[GUPC_MAX_ATOMIC_SIZE];
+  size_t size = gupcr_get_atomic_size (type);
+  gupcr_debug (FC_ATOMIC, "%lu:0x%lx %s:%s v(%s)", dthread, doffset,
+	       gupcr_strptlop (op), gupcr_strptldatatype (type),
+	       gupcr_get_buf_as_hex (tmpbuf, value, size));
+  rpid.rank = dthread;
+  if (fetch_ptr)
+    {
+      /* Fetch-and-op: the old value is returned in atomic_tmp_buf.  */
+      gupcr_portals_call (PtlFetchAtomic,
+			  (gupcr_atomic_md, (ptl_size_t) atomic_tmp_buf,
+			   gupcr_atomic_md, (ptl_size_t) value,
+			   size, rpid, GUPCR_PTL_PTE_ATOMIC,
+			   PTL_NO_MATCH_BITS, doffset,
+			   PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA, op, type));
+    }
+  else
+    {
+      /* Plain atomic op; request an ACK so the counting event fires.  */
+      gupcr_portals_call (PtlAtomic,
+			  (gupcr_atomic_md, (ptl_size_t) value,
+			   size, PTL_ACK_REQ, rpid, GUPCR_PTL_PTE_ATOMIC,
+			   PTL_NO_MATCH_BITS, doffset,
+			   PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA, op, type));
+    }
+  /* Wait for the ACK/reply counting event on the local MD.  */
+  gupcr_atomic_md_count += 1;
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_atomic_md_ct, gupcr_atomic_md_count, &ct));
+  if (ct.failure)
+    {
+      gupcr_process_fail_events (gupcr_atomic_md_eq);
+      gupcr_fatal_error ("received an error on atomic MD");
+    }
+  if (fetch_ptr)
+    {
+      gupcr_debug (FC_ATOMIC, "ov(%s)",
+		   gupcr_get_buf_as_hex (tmpbuf, atomic_tmp_buf, size));
+      memcpy (fetch_ptr, atomic_tmp_buf, size);
+    }
+}
+
+/**
+ * Initialize atomics resources.
+ * @ingroup INIT
+ */
+void
+gupcr_atomic_init (void)
+{
+  ptl_md_t md;
+
+  gupcr_log (FC_ATOMIC, "atomic init called");
+
+  /* Setup the Portals MD for local source/destination copying.
+     We need to map the whole user's space (same as gmem).  */
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_atomic_md_ct));
+  /* Event queue of depth 1: only failure events are delivered
+     (success events are disabled on the MD below).  */
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_atomic_md_eq));
+  md.length = (ptl_size_t) USER_PROG_MEM_SIZE;
+  md.start = (void *) USER_PROG_MEM_START;
+  /* Count ACK and REPLY events; suppress full success events.  */
+  md.options = PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_CT_REPLY |
+    PTL_MD_EVENT_SUCCESS_DISABLE;
+  md.eq_handle = gupcr_atomic_md_eq;
+  md.ct_handle = gupcr_atomic_md_ct;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_atomic_md));
+
+  /* Reset number of acknowledgments.  */
+  gupcr_atomic_md_count = 0;
+}
+
+/**
+ * Release atomics resources.
+ *
+ * Frees, in order, the MD, its counting-event handle, and its
+ * event queue allocated by gupcr_atomic_init.
+ * @ingroup INIT
+ */
+void
+gupcr_atomic_fini (void)
+{
+  gupcr_log (FC_ATOMIC, "atomic fini called");
+  /* Release atomic MD and its resources.  */
+  gupcr_portals_call (PtlMDRelease, (gupcr_atomic_md));
+  gupcr_portals_call (PtlCTFree, (gupcr_atomic_md_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_atomic_md_eq));
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_atomic_sup.h
===================================================================
--- libgupc/portals4/gupcr_atomic_sup.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_atomic_sup.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,84 @@
+/* Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef _GUPCR_ATOMIC_SUP_H_
+#define _GUPCR_ATOMIC_SUP_H_ 1
+
+/**
+ * @file gupcr_atomic_sup.h
+ * GUPC Portals4 atomics implementation support routines.
+ *
+ * @addtogroup ATOMIC GUPCR Atomics Support Functions
+ * @{
+ */
+
+/** Maximum size of atomic types */
+#define GUPC_MAX_ATOMIC_SIZE 16
+
+/** Convert from UPC atomics int to Portals atomic type */
+#if __SIZEOF_INT__ == 4
+#define UPC_ATOMIC_TO_PTL_INT PTL_INT32_T
+#define UPC_ATOMIC_TO_PTL_UINT PTL_UINT32_T
+#elif __SIZEOF_INT__ == 8
+#define UPC_ATOMIC_TO_PTL_INT PTL_INT64_T
+#define UPC_ATOMIC_TO_PTL_UINT PTL_UINT64_T
+#else
+#error "Size of int not supported"
+#endif
+/** Convert from UPC atomics long to Portals atomic type */
+#if __SIZEOF_LONG__ == 4
+#define UPC_ATOMIC_TO_PTL_LONG PTL_INT32_T
+#define UPC_ATOMIC_TO_PTL_ULONG PTL_UINT32_T
+#elif __SIZEOF_LONG__ == 8
+#define UPC_ATOMIC_TO_PTL_LONG PTL_INT64_T
+#define UPC_ATOMIC_TO_PTL_ULONG PTL_UINT64_T
+#else
+#error "Size of long not supported"
+#endif
+/** Convert from UPC atomic int32 to Portals atomic type */
+#define UPC_ATOMIC_TO_PTL_INT32 PTL_INT32_T
+#define UPC_ATOMIC_TO_PTL_UINT32 PTL_UINT32_T
+/** Convert from UPC atomic int64 to Portals atomic type */
+#define UPC_ATOMIC_TO_PTL_INT64 PTL_INT64_T
+#define UPC_ATOMIC_TO_PTL_UINT64 PTL_UINT64_T
+/** Convert from UPC atomic float to Portals atomic type */
+#define UPC_ATOMIC_TO_PTL_FLOAT PTL_FLOAT
+/** Convert from UPC atomic double to Portals atomic type */
+#define UPC_ATOMIC_TO_PTL_DOUBLE PTL_DOUBLE
+
+/** @} */
+
+/* Entry points implemented in gupcr_atomic_sup.c.  */
+/* NOTE(review): gupcr_atomic_put is declared here but no definition
+   appears in gupcr_atomic_sup.c -- confirm it is implemented
+   elsewhere or drop the declaration.  */
+void gupcr_atomic_put (size_t, size_t, size_t, ptl_op_t op, ptl_datatype_t);
+void gupcr_atomic_get (size_t, size_t, void *, ptl_datatype_t);
+void gupcr_atomic_set (size_t, size_t, void *, const void *, ptl_datatype_t);
+void gupcr_atomic_cswap (size_t, size_t, void *, const void *,
+			 const void *, ptl_datatype_t);
+void gupcr_atomic_op (size_t, size_t, void *, const void *,
+		      ptl_op_t, ptl_datatype_t);
+void gupcr_atomic_init (void);
+void gupcr_atomic_fini (void);
+
+#endif /* gupcr_atomic_sup.h */
Index: libgupc/portals4/gupcr_backtrace.c
===================================================================
--- libgupc/portals4/gupcr_backtrace.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_backtrace.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,400 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_utils.h"
+#include "gupcr_backtrace.h"
+#include "gupcr_barrier.h"
+#include <signal.h>
+#include <string.h>
+#if HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
+#if HAVE_LIMITS_H
+#include <limits.h>
+#endif
+
+/** Skip over frames belonging to the backtrace code itself.  */
+#define GUPCR_BT_SKIP_FRAME_CNT 3
+/** Maximum number of stack frames to display.  */
+#define GUPCR_BT_DEPTH_CNT 128
+
+#ifndef PATH_MAX
+#define PATH_MAX 1024
+#endif
+
+/** Default backtrace file name prefix.  */
+#define UPC_BACKTRACE_PREFIX "backtrace"
+
+/** Full path of the executable program.  */
+static char *gupcr_abs_execname;
+
+/** Backtrace on faults enabled flag.  */
+static int bt_enabled = 0;
+
+/**
+ * GLIBC backtrace.
+ *
+ * Show a backtrace by using the GLIBC backtrace functionality.
+ * The backtrace is augmented with source file/line numbers when
+ * addr2line is available.
+ *
+ * By default backtrace lines are written to 'stderr'.  The
+ * environment variable UPC_BACKTRACEFILE can be used to redirect
+ * the backtrace to a file; its value is used as a simple prefix
+ * for the backtrace file name.  For example, if it is set to
+ * "/tmp/trace-upc", the actual trace file is
+ * "/tmp/trace-upc.THREADID".  If the variable is set to an empty
+ * string, the default "backtrace" prefix is used.
+ */
+void
+gupcr_backtrace (void)
+{
+  void *strace[GUPCR_BT_DEPTH_CNT];
+  size_t size, i;
+  char **strace_str;
+  char *file_env;
+  int under_upc_main = 1;
+  FILE *traceout = stderr;
+
+  file_env = getenv (GUPCR_BACKTRACE_FILE_ENV);
+  if (file_env)
+    {
+      /* Worst case suffix: "." followed by a ten digit thread number.  */
+      #define MAX_INT_STRING ".2147483647"
+      char *tracefile;
+      int len, lenw;
+      /* Use the default trace file prefix if none was specified.  */
+      if (!strlen (file_env))
+	file_env = (char *) UPC_BACKTRACE_PREFIX;
+      len = strlen (file_env) + strlen (MAX_INT_STRING) + 1;
+      tracefile = malloc (len);
+      if (!tracefile)
+	gupcr_fatal_error ("cannot allocate (%d) memory for backtrace file %s",
+			   len, file_env);
+      lenw = snprintf (tracefile, len, "%s.%d", file_env, MYTHREAD);
+      if ((lenw >= len) || (lenw < 0))
+	gupcr_fatal_error ("cannot create backtrace file name: %s", file_env);
+      traceout = fopen (tracefile, "w");
+      if (!traceout)
+	gupcr_fatal_error ("cannot open backtrace file: %s", tracefile);
+      free (tracefile);
+    }
+  else
+    fprintf (traceout, "Thread %d backtrace:\n", MYTHREAD);
+
+  /* Use the "backtrace" functionality of glibc to collect the raw
+     return addresses.  */
+  size = backtrace (strace, GUPCR_BT_DEPTH_CNT);
+  /* Add symbolic information to each address and print the stack
+     trace.  Frames belonging to the backtrace code itself are
+     skipped.  */
+  for (i = GUPCR_BT_SKIP_FRAME_CNT; i < size; i++)
+    {
+      if (under_upc_main)
+        {
+# if HAVE_UPC_BACKTRACE_ADDR2LINE
+	  /* Call addr2line to generate source files, line numbers,
+	     and functions.  In case of any error (malloc, snprintf)
+	     do not abort the program.  */
+	  FILE *a2l;
+	  #define CMD_TMPL "%s -f -e %s %p"
+	  /* Allow space for addr2line, filename, command line options,
+	     and the address argument for addr2line.  */
+	  int cmd_size = strlen (GUPCR_BACKTRACE_ADDR2LINE) +
+			 strlen (gupcr_abs_execname) +
+			 strlen (CMD_TMPL) +
+			 strlen ("0x1234567812345678");
+	  int sz;
+	  char *cmd = malloc (cmd_size);
+	  if (!cmd)
+	    {
+	      fprintf (traceout, "unable to acquire memory "
+				 "for addr2line command\n");
+	      goto done;
+	    }
+	  /* Create the actual addr2line command.  */
+	  sz = snprintf (cmd, cmd_size, CMD_TMPL, GUPCR_BACKTRACE_ADDR2LINE,
+			 gupcr_abs_execname, strace[i]);
+	  if ((sz >= cmd_size) || (sz < 0))
+	    {
+	      fprintf (traceout, "unable to create addr2line "
+				 "command line\n");
+	      free (cmd);
+	      goto done;
+	    }
+	  /* Execute addr2line.  */
+	  a2l = popen (cmd, "r");
+	  free (cmd);
+	  if (a2l)
+	    {
+	      /* addr2line responds with two lines: procedure name and
+		 the file name with line number.  */
+	      int max_rep = 2 * FILENAME_MAX;
+	      /* Build a data structure that is identical to the
+		 structure returned by the glibc backtrace_symbols().  */
+	      struct back_trace {
+		char *addr;
+	        char data[1];
+	      };
+	      struct back_trace *rep = malloc (max_rep);
+	      int index = 0;
+	      if (!rep)
+		{
+		  fprintf (traceout, "unable to acquire memory "
+				     "for backtracing\n");
+		  pclose (a2l);
+		  goto done;
+		}
+	      rep->data[0] = '\0';
+	      /* Read the addr2line response.  */
+	      while (fgets (&rep->data[index], max_rep - index, a2l))
+		{
+		  /* Remove all the new lines, as addr2line returns
+		     info in multiple lines.  */
+		  index = strlen (&rep->data[0]);
+		  if (rep->data[index - 1] == '\n')
+		    rep->data[index - 1] = ' ';
+		}
+	      pclose (a2l);
+	      /* 'addr' is the first member of the struct, so the
+		 free (strace_str) below releases 'rep' itself.  */
+	      rep->addr = &rep->data[0];
+	      strace_str = &rep->addr;
+	    }
+	  else
+	    {
+	      /* Somehow we failed to invoke addr2line, fall back
+	         to glibc.  */
+	      strace_str = backtrace_symbols (&strace[i], 1);
+	    }
+# else
+	  strace_str = backtrace_symbols (&strace[i], 1);
+# endif
+	  fprintf (traceout, "[%4d][%lld] %s\n", MYTHREAD,
+		   (long long int) (i - GUPCR_BT_SKIP_FRAME_CNT), *strace_str);
+	  /* Extra info for the barrier. */
+	  if (strstr (*strace_str, "__upc_wait"))
+	    {
+	      fprintf (traceout, "[%4d]       BARRIER ID: %d\n", MYTHREAD,
+		       gupcr_barrier_id);
+	    }
+	  if (strstr (*strace_str, "upc_main"))
+	    under_upc_main = 0;
+	  /* The symbol trace buffer must be released.  */
+	  free (strace_str);
+	}
+    }
+ done:
+  fflush (traceout);
+  if (file_env)
+    fclose (traceout);
+}
+
+#define GUPCR_BACKTRACE_PID_BUFLEN 16
+
+/**
+ * Backtrace on fatal errors.
+ *
+ * Print a backtrace (stack frames) on fatal errors: a run-time
+ * fatal error or a segmentation fault.
+ *
+ * A backtrace is printed only if the environment variable
+ * UPC_BACKTRACE is set to 1.  The following backtrace capabilities
+ * are tried in order:
+ *
+ * (1) Use GDB for the backtrace (if enabled)
+ * (2) Use the GLIBC backtrace with source file/line display (if
+ *     addr2line is available)
+ * (3) Use the GLIBC backtrace with raw addresses (the display is
+ *     improved if the -rdynamic option is supported by the linker)
+ */
+void
+gupcr_fatal_error_backtrace (void)
+{
+  if (bt_enabled)
+    {
+#ifdef HAVE_UPC_BACKTRACE_GDB
+      {
+	char *env;
+	const char *gdb;
+	char pid_buf[GUPCR_BACKTRACE_PID_BUFLEN];
+	int child_pid;
+	/* Select the gdb executable; a missing or empty environment
+	   variable selects the configured default.  */
+	env = getenv (GUPCR_BACKTRACE_GDB_ENV);
+	if (!env || (strlen (env) == 0))
+	  gdb = GUPCR_BACKTRACE_GDB;
+	else
+	  gdb = (const char *) env;
+	if (strcmp (gdb, "none"))
+	  {
+	    const char *err_msg = 0;
+	    char tmpf[PATH_MAX];
+	    int fbt;
+	    const char *btcmd = "backtrace 30\n";
+	    fprintf (stderr, "Thread %d GDB backtrace:\n", MYTHREAD);
+	    /* Get the pid of the running program; gdb attaches to it.  */
+	    sprintf (pid_buf, "%d", getpid ());
+	    /* Create a temporary file holding the GDB commands.  */
+	    if ((fbt = gupcr_create_temp_file
+		       ("upc_bt_gdb.XXXXXX", tmpf, &err_msg)) == -1)
+	      {
+		fprintf (stderr, "cannot open gdb command - %s\n", err_msg);
+		return;
+	      }
+	    /* Write the command text.  Note: use strlen, not sizeof;
+	       sizeof (btcmd) is the size of the pointer and would
+	       truncate the command.  */
+	    if (write (fbt, btcmd, strlen (btcmd)) == -1)
+	      {
+		perror ("cannot write gdb command file for backtrace");
+		close (fbt);
+		unlink (tmpf);
+		return;
+	      }
+	    if (close (fbt))
+	      {
+		perror ("cannot close gdb command file for backtrace");
+		unlink (tmpf);
+		return;
+	      }
+	    child_pid = fork ();
+	    if (!child_pid)
+	      {
+		/* Child: redirect gdb's stdout to stderr and run gdb
+		   in batch mode against this process.  */
+		dup2 (2, 1);
+		execlp (gdb, gdb, "-nx", "-batch", "-x", tmpf,
+			gupcr_abs_execname, pid_buf, NULL);
+		fprintf (stderr, "cannot start GDB - %s\n", gdb);
+		abort ();	/* If gdb failed to start.  */
+	      }
+	    else if (child_pid > 0)
+	      waitpid (child_pid, NULL, 0);
+	    else
+	      perror ("cannot fork for gdb backtrace");
+	    unlink (tmpf);
+	    return;
+	  }
+      }
+#endif /* HAVE_UPC_BACKTRACE_GDB */
+
+      /* Simple backtrace only. */
+      gupcr_backtrace ();
+    }
+}
+
+/**
+ * Backtrace signal handler.
+ *
+ * Print the stack frames of the current thread upon receipt of
+ * the backtrace request signal; all handler arguments are unused
+ * and the work is delegated to gupcr_backtrace().
+ * NOTE(review): any special handling for the monitor thread (e.g.
+ * printing thread/process mappings) happens elsewhere -- this
+ * handler itself only prints a backtrace.
+ */
+static void
+gupcr_backtrace_handler (int sig __attribute__ ((unused)),
+			 siginfo_t *siginfo __attribute__ ((unused)),
+			 void *context __attribute__ ((unused)))
+{
+  gupcr_backtrace ();
+}
+
+/**
+ * Backtrace fault handler.
+ *
+ * A fault happened and backtrace is enabled.  The signal handlers
+ * are first restored to their defaults, so that re-delivery of the
+ * signal terminates the thread and allows the monitor thread to
+ * terminate all the other threads; then the fatal-error backtrace
+ * is printed.
+ */
+static void
+gupcr_fault_handler (int sig __attribute__ ((unused)),
+	  	     siginfo_t *siginfo __attribute__ ((unused)),
+		     void *context __attribute__ ((unused)))
+{
+  gupcr_backtrace_restore_handlers ();
+  gupcr_fatal_error_backtrace ();
+}
+
+/**
+ * Initialize UPC backtrace.
+ *
+ * Derive the absolute path of the executable (used by the GDB and
+ * addr2line based backtraces) and install the backtrace related
+ * signal handlers.
+ * @param [in] execname Executable path (as passed in argv[0])
+ */
+void
+gupcr_backtrace_init (const char *execname)
+{
+  /* Find the full path for the executable.  On Linux systems we
+     might be able to read "/proc/self/exe" to get the full
+     executable path, but that is not portable.  */
+  gupcr_abs_execname = malloc (PATH_MAX + 1);
+  if (!gupcr_abs_execname)
+    gupcr_fatal_error ("cannot allocate space for executable file name");
+  if (execname[0] != '/')
+    {
+      /* Relative path: prefix the current working directory.  The
+         previous code sized the getcwd buffer with
+         "sizeof (gupcr_abs_execname)" -- the size of a pointer --
+         producing a negative length; snprintf also guarantees
+         NUL-termination and avoids strcat overflow (an over-long
+         name is truncated, which only degrades the backtrace).  */
+      char cwd[PATH_MAX + 1];
+      const char *dir;
+      if (getcwd (cwd, sizeof (cwd)))
+	dir = cwd;
+      else
+	dir = "/BT_CANNOT_CREATE_ABS_PATH";
+      snprintf (gupcr_abs_execname, PATH_MAX + 1, "%s/%s", dir, execname);
+    }
+  else
+    snprintf (gupcr_abs_execname, PATH_MAX + 1, "%s", execname);
+
+#ifdef HAVE_UPC_BACKTRACE_SIGNAL
+  {
+    /* Install the backtrace signal handler (backtrace on request). */
+    struct sigaction act;
+    memset (&act, '\0', sizeof (act));
+    act.sa_sigaction = &gupcr_backtrace_handler;
+    act.sa_flags = SA_SIGINFO;
+    if (sigaction (GUPCR_BACKTRACE_SIGNAL, &act, NULL) < 0)
+      perror ("was not able to install backtrace handler");
+  }
+#endif
+
+  /* Install fault handlers only if backtrace is enabled.  */
+  bt_enabled = gupcr_is_backtrace_enabled ();
+
+  if (bt_enabled)
+    {
+      struct sigaction act;
+      memset (&act, '\0', sizeof (act));
+      act.sa_sigaction = &gupcr_fault_handler;
+      act.sa_flags = SA_SIGINFO;
+      if (sigaction (SIGABRT, &act, NULL) < 0)
+        perror ("unable to install SIGABRT handler");
+      if (sigaction (SIGILL, &act, NULL) < 0)
+        perror ("unable to install SIGILL handler");
+      if (sigaction (SIGSEGV, &act, NULL) < 0)
+        perror ("unable to install SIGSEGV handler");
+      if (sigaction (SIGBUS, &act, NULL) < 0)
+        perror ("unable to install SIGBUS handler");
+      if (sigaction (SIGFPE, &act, NULL) < 0)
+        perror ("unable to install SIGFPE handler");
+    }
+}
+
+/**
+ * Restore default handlers.
+ *
+ * Must be called once the run-time has discovered a fatal error.
+ */
+void
+gupcr_backtrace_restore_handlers (void)
+{
+  /* Stop handling faults with the backtrace code: reinstate the
+     default disposition for every fault signal we hooked.  */
+  static const int fault_signals[] =
+    { SIGABRT, SIGILL, SIGSEGV, SIGBUS, SIGFPE };
+  size_t i;
+  for (i = 0; i < sizeof (fault_signals) / sizeof (fault_signals[0]); i++)
+    signal (fault_signals[i], SIG_DFL);
+}
Index: libgupc/portals4/gupcr_backtrace.h
===================================================================
--- libgupc/portals4/gupcr_backtrace.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_backtrace.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,45 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+#ifndef GUPCR_BACKTRACE_H_
+#define GUPCR_BACKTRACE_H_
+
+/* Environment variables. */
+/** Enable/Disable backtrace env variable. */
+#define GUPCR_BACKTRACE_ENV "UPC_BACKTRACE"
+/** Backtrace file name prefix env variable. */
+#define GUPCR_BACKTRACE_FILE_ENV "UPC_BACKTRACEFILE"
+/** GDB command for backtrace env variable. */
+#define GUPCR_BACKTRACE_GDB_ENV "UPC_BACKTRACE_GDB"
+
+/* Interfaces. */
+extern void gupcr_backtrace (void);
+/* Renamed from gupcr_fatal_backtrace to match the definition in
+   gupcr_backtrace.c (gupcr_fatal_error_backtrace); the old name
+   was declared but never defined.  */
+extern void gupcr_fatal_error_backtrace (void);
+extern void gupcr_backtrace_init (const char *execname);
+extern void gupcr_backtrace_restore_handlers (void);
+
+#endif /* gupcr_backtrace.h */
Index: libgupc/portals4/gupcr_barrier.c
===================================================================
--- libgupc/portals4/gupcr_barrier.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_barrier.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,1003 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_barrier.c
+ * GUPC Portals4 barrier implementation.
+ *
+ * The UPC barrier synchronization statements are:
+ *  - upc_notify <i>expression</i>
+ *  - upc_wait <i>expression</i>
+ *  - upc_barrier <i>expression</i>
+ *
+ * The upc barrier statement is equivalent to the compound statement:
+ *   <i>{ upc_notify barrier_value; upc_wait barrier_value; }</i>
+ *
+ * Important rules:
+ *  - Each thread executes an alternating sequence of upc_notify and upc_wait
+ *    statements.
+ *  - A thread completes a <i>upc_wait</i> statement when all threads
+ *    have executed a matching <i>upc_notify</i> statement.
+ *  - <i>upc_notify</i> and <i>upc_wait</i> are collective operations and
+ *    <i>expression</i> (if available) must match across all threads.
+ *  - An empty <i>expression</i> matches any barrier ID.
+ *
+ * The GUPC runtime barrier implementation uses an "all reduce"
+ * algorithm as outlined in the paper <i>Enabling Flexible Collective
+ * Communication Offload with Triggered Operations</i> by Keith Underwood
+ * et al. January, 2007.  Portals atomic operations and triggered
+ * atomic operations are used to propagate and verify
+ * that all UPC threads have entered the same synchronization phase
+ * with matching barrier IDs.
+ *
+ * For the purposes of implementing GUPC barriers, all UPC threads
+ * in a given job are organized as a tree.  Thread 0 is the
+ * root thread (at the top of the tree). Other threads can be
+ * either an inner thread (has at least one child), or a leaf
+ * thread (has no children).
+ *
+ * A UPC barrier is implemented in two distinctive steps: notify and wait.
+ *
+ * A notify step uses the GUPCR_PTL_PTE_BARRIER_UP PTE to pass
+ * its barrier ID to the parent.  The result of an atomic PTL_MIN
+ * operation among children and their parent is passed to the
+ * parent's parent until thread 0 is reached.
+ *
+ * A wait step uses the GUPCR_PTL_PTE_BARRIER_DOWN PTE to pass
+ * the derived consensus barrier ID to all threads.  An error
+ * is raised if the derived ID does not match the thread's barrier ID.
+ *
+ * This implementation supports a split phase barrier where a given
+ * thread completes its wait statement once all other threads
+ * have reached their matching notify statement.
+ *
+ * Each thread uses the following resources:
+ *
+ *   - PTEs (and LEs) for passing barrier IDs UP and DOWN the tree
+ *   - MDs for sending a thread's barrier ID to parents and children
+ *   - Counting events for LEs and MDs
+ *   - Event queues for failure events on LEs and MDs
+ *
+ * Extensive use of Portals triggered functions allow for the efficient
+ * implementation of a split phase barrier.
+ *
+ * @addtogroup BARRIER GUPCR Barrier Functions
+ * @{
+ */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_sync.h"
+#include "gupcr_broadcast.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+
+/** Per-thread flag set by upc_notify() and cleared by upc_wait() */
+static int gupcr_barrier_active = 0;
+
+/** Max barrier ID used by the barrier implementation.
+ * The Portals PTL_MIN atomic function is used by
+ * each thread to report its barrier ID to its parents.
+ * The MAX barrier ID value is used to initialize the memory
+ * location targeted by PTL_MIN function.
+ */
+#define BARRIER_ID_MAX INT_MAX
+/** Anonymous barrier ID used by the barrier implementation.
+ * This barrier ID is used for barrier statements that do not
+ * specify a barrier ID and it matches any other barrier ID.
+ */
+#define BARRIER_ANONYMOUS INT_MIN
+/** Size of the barrier ID */
+#define BARRIER_ID_SIZE (sizeof (gupcr_barrier_value))
+
+/** Leaf thread check */
+#define LEAF_THREAD  ((THREADS != 1) && (gupcr_child_cnt == 0))
+/** Root thread check */
+#define ROOT_THREAD  (gupcr_parent_thread == -1)
+/** Inner thread check */
+#define INNER_THREAD ((gupcr_child_cnt != 0) && (gupcr_parent_thread != -1))
+
+/** Thread's current barrier ID */
+int gupcr_barrier_id;
+
+/** Memory storage for notify barrier ID.  Mapped by
+    LE for external access, and MD for internal access.  */
+
+static int gupcr_notify_value;
+/** Barrier notify LE handle (appended to GUPCR_PTL_PTE_BARRIER_UP) */
+static ptl_handle_le_t gupcr_notify_le;
+/** Barrier notify LE EQ handle */
+static ptl_handle_eq_t gupcr_notify_le_eq;
+/** Barrier notify LE CT handle */
+static ptl_handle_ct_t gupcr_notify_le_ct;
+/** Barrier notify LE CT wait counter */
+static ptl_size_t gupcr_notify_le_count;
+/** Barrier notify MD handle */
+static ptl_handle_md_t gupcr_notify_md;
+/** Barrier notify MD EQ handle */
+static ptl_handle_eq_t gupcr_notify_md_eq;
+/** Barrier notify MD CT handle */
+static ptl_handle_ct_t gupcr_notify_md_ct;
+/** Barrier notify MD CT wait counter */
+static ptl_size_t gupcr_notify_md_count;
+
+/** Barrier wait memory buffer pointer.  The buffer is
+    mapped by a Portals LE for external access, and a Portals
+    MD for internal access.  A pointer to the buffer is needed
+    because the broadcast implementation used internally by the
+    UPC runtime can broadcast arbitrarily sized values (that may
+    be larger than an 'int').  */
+static int *gupcr_wait_ptr;
+/** Barrier wait LE handle (appended to GUPCR_PTL_PTE_BARRIER_DOWN) */
+static ptl_handle_le_t gupcr_wait_le;
+/** Barrier wait LE CT handle */
+static ptl_handle_ct_t gupcr_wait_le_ct;
+/** Barrier wait LE CT wait counter */
+static ptl_size_t gupcr_wait_le_count;
+/** Barrier wait LE EQ handle */
+static ptl_handle_eq_t gupcr_wait_le_eq;
+/** Barrier wait MD handle */
+static ptl_handle_md_t gupcr_wait_md;
+/** Barrier wait MD CT handle */
+static ptl_handle_ct_t gupcr_wait_md_ct;
+/** Barrier wait MD CT wait counter */
+static ptl_size_t gupcr_wait_md_count;
+/** Barrier wait MD EQ handle */
+static ptl_handle_eq_t gupcr_wait_md_eq;
+
+/** Memory storage (notify source) that holds the barrier ID for the PTL_MIN
+    atomic function used in the notify phase of the barrier.  */
+static int gupcr_barrier_value;
+/** Barrier MD handle for the notify source */
+static ptl_handle_md_t gupcr_barrier_md;
+/** Barrier CT handle for the notify source */
+static ptl_handle_ct_t gupcr_barrier_md_ct;
+/** Barrier CT handle for the notify source wait counter */
+static ptl_size_t gupcr_barrier_md_count;
+/** Barrier EQ handle for the notify source */
+static ptl_handle_eq_t gupcr_barrier_md_eq;
+
+/** Memory storage that holds the maximum barrier ID value used to
+    re-initialize the memory storage for the notify barrier ID.  */
+static int gupcr_barrier_max_value = BARRIER_ID_MAX;
+/** Barrier MD for MAX re-init */
+static ptl_handle_md_t gupcr_barrier_max_md;
+/** Barrier CT handle for MAX re-init */
+static ptl_handle_ct_t gupcr_barrier_max_md_ct;
+/** Barrier CT handle for MAX re-init wait counter */
+static ptl_size_t gupcr_barrier_max_md_count;
+/** Barrier EQ handle for MAX re-init */
+static ptl_handle_eq_t gupcr_barrier_max_md_eq;
+
+/**
+ * @fn __upc_notify (int barrier_id)
+ * UPC <i>upc_notify</i> statement implementation
+ *
+ * This procedure sets the necessary Portals triggers to implement
+ * the pass that derives a consensus barrier ID value across all
+ * UPC threads.  The inner threads use Portals triggered operations
+ * to pass the barrier ID negotiated among itself and its children
+ * up the tree to its parent.
+ * @param [in] barrier_id Barrier ID
+ */
+void
+__upc_notify (int barrier_id)
+{
+  /* Target process rank for Portals calls; unused when the
+     non-triggered (#else) implementation is compiled in.  */
+  ptl_process_t rpid __attribute ((unused));
+
+  gupcr_trace (FC_BARRIER, "BARRIER NOTIFY ENTER %d", barrier_id);
+
+  if (gupcr_barrier_active)
+    gupcr_error ("two successive upc_notify statements executed "
+		 "without an intervening upc_wait");
+  gupcr_barrier_active = 1;
+  gupcr_barrier_id = barrier_id;
+
+  /* The UPC shared memory consistency model requires all outstanding
+     read/write operations to complete on the thread's
+     current synchronization phase.  */
+  gupcr_gmem_sync ();
+
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+  if (THREADS == 1)
+    return;
+
+  /* Use barrier MAX number if barrier ID is "match all"
+     This effectively excludes the thread from setting the min ID
+     among the threads.  */
+  gupcr_barrier_value = (barrier_id == BARRIER_ANONYMOUS) ?
+    BARRIER_ID_MAX : barrier_id;
+
+  /* Dump the current counting-event values when barrier debugging
+     is enabled; the value in parentheses is the locally expected
+     count.  */
+  if (gupcr_debug_enabled (FC_BARRIER))
+    {
+      ptl_ct_event_t ct;
+      gupcr_portals_call (PtlCTGet, (gupcr_wait_le_ct, &ct));
+      gupcr_debug (FC_BARRIER, "Wait LE counter: %lu (%lu)",
+		   (long unsigned) ct.success,
+		   (long unsigned) gupcr_wait_le_count);
+      gupcr_portals_call (PtlCTGet, (gupcr_wait_md_ct, &ct));
+      gupcr_debug (FC_BARRIER, "Wait MD counter: %lu (%lu)",
+		   (long unsigned) ct.success,
+		   (long unsigned) gupcr_wait_md_count);
+      gupcr_portals_call (PtlCTGet, (gupcr_notify_le_ct, &ct));
+      gupcr_debug (FC_BARRIER, "Notify LE counter: %lu (%lu)",
+		   (long unsigned) ct.success,
+		   (long unsigned) gupcr_notify_le_count);
+      gupcr_portals_call (PtlCTGet, (gupcr_notify_md_ct, &ct));
+      gupcr_debug (FC_BARRIER, "Notify MD counter: %lu (%lu)",
+		   (long unsigned) ct.success,
+		   (long unsigned) gupcr_notify_md_count);
+      gupcr_portals_call (PtlCTGet, (gupcr_barrier_md_ct, &ct));
+      gupcr_debug (FC_BARRIER, "Barrier MD counter: %lu (%lu)",
+		   (long unsigned) ct.success,
+		   (long unsigned) gupcr_barrier_md_count);
+      gupcr_portals_call (PtlCTGet, (gupcr_barrier_max_md_ct, &ct));
+      gupcr_debug (FC_BARRIER, "Barrier max MD counter: %lu (%lu)",
+		   (long unsigned) ct.success,
+		   (long unsigned) gupcr_barrier_max_md_count);
+    }
+
+  if (LEAF_THREAD)
+    {
+      /* Send the barrier ID to the parent - use atomic PTL_MIN to allow
+         parent to find the minimum barrier ID among itself and its
+         children.  */
+      gupcr_debug (FC_BARRIER, "Send atomic PTL_MIN %d to (%d)",
+		   gupcr_barrier_value, gupcr_parent_thread);
+      rpid.rank = gupcr_parent_thread;
+      gupcr_portals_call (PtlAtomic, (gupcr_barrier_md, 0,
+				      BARRIER_ID_SIZE, PTL_NO_ACK_REQ,
+				      rpid, GUPCR_PTL_PTE_BARRIER_UP,
+				      PTL_NO_MATCH_BITS, 0, PTL_NULL_USER_PTR,
+				      PTL_NULL_HDR_DATA, PTL_MIN,
+				      PTL_INT32_T));
+    }
+  else
+    {
+      int i;
+      if (ROOT_THREAD)
+	{
+	  /* The consensus MIN barrier ID derived in the notify (UP) phase
+	     must be transferred to the wait LE for delivery to all children.
+	     Trigger: Barrier ID received in the notify phase.
+	     Action: Send the barrier ID to the wait buffer of the
+	     barrier DOWN LE.  */
+	  rpid.rank = MYTHREAD;
+	  gupcr_notify_le_count += gupcr_child_cnt + 1;
+	  gupcr_portals_call (PtlTriggeredPut, (gupcr_notify_md, 0,
+						BARRIER_ID_SIZE,
+						PTL_NO_ACK_REQ, rpid,
+						GUPCR_PTL_PTE_BARRIER_DOWN,
+						PTL_NO_MATCH_BITS, 0,
+						PTL_NULL_USER_PTR,
+						PTL_NULL_HDR_DATA,
+						gupcr_notify_le_ct,
+						gupcr_notify_le_count));
+
+	}
+      else
+	{
+	  /* The consensus MIN barrier ID of the inner thread and its children
+	     is sent to the parent UPC thread.
+	     Trigger: All children and this thread execute an atomic PTL_MIN
+	     using each thread's UP LE.
+	     Action: Transfer the consensus minimum barrier ID to the
+	     this thread's parent.  */
+	  rpid.rank = gupcr_parent_thread;
+	  gupcr_notify_le_count += gupcr_child_cnt + 1;
+	  gupcr_portals_call (PtlTriggeredAtomic, (gupcr_notify_md, 0,
+						   BARRIER_ID_SIZE,
+						   PTL_NO_ACK_REQ, rpid,
+						   GUPCR_PTL_PTE_BARRIER_UP,
+						   PTL_NO_MATCH_BITS, 0,
+						   PTL_NULL_USER_PTR,
+						   PTL_NULL_HDR_DATA,
+						   PTL_MIN, PTL_INT32_T,
+						   gupcr_notify_le_ct,
+						   gupcr_notify_le_count));
+	}
+
+      /* Trigger: Barrier ID received in the wait buffer.
+         Action: Reinitialize the barrier UP ID to barrier MAX value
+         for the next call to upc_notify.  */
+      rpid.rank = MYTHREAD;
+      gupcr_wait_le_count += 1;
+      gupcr_portals_call (PtlTriggeredPut, (gupcr_barrier_max_md, 0,
+					    BARRIER_ID_SIZE,
+					    PTL_NO_ACK_REQ, rpid,
+					    GUPCR_PTL_PTE_BARRIER_UP,
+					    PTL_NO_MATCH_BITS, 0,
+					    PTL_NULL_USER_PTR,
+					    PTL_NULL_HDR_DATA,
+					    gupcr_wait_le_ct,
+					    gupcr_wait_le_count));
+
+      /* Trigger: The barrier ID is reinitialized to MAX.
+         Action: Send the consensus barrier ID to all children.  */
+      gupcr_notify_le_count += 1;
+      for (i = 0; i < gupcr_child_cnt; i++)
+	{
+	  rpid.rank = gupcr_child[i];
+	  gupcr_portals_call (PtlTriggeredPut, (gupcr_wait_md, 0,
+						BARRIER_ID_SIZE,
+						PTL_OC_ACK_REQ, rpid,
+						GUPCR_PTL_PTE_BARRIER_DOWN,
+						PTL_NO_MATCH_BITS, 0,
+						PTL_NULL_USER_PTR,
+						PTL_NULL_HDR_DATA,
+						gupcr_notify_le_ct,
+						gupcr_notify_le_count));
+	}
+
+      /* Allow notify to proceed and to possibly complete the wait
+         phase on other threads.  */
+
+      /* Find the minimum barrier ID among children and the root.  */
+      gupcr_debug (FC_BARRIER, "Send atomic PTL_MIN %d to (%d)",
+		   gupcr_barrier_value, MYTHREAD);
+      rpid.rank = MYTHREAD;
+      gupcr_portals_call (PtlAtomic, (gupcr_barrier_md, 0,
+				      BARRIER_ID_SIZE, PTL_NO_ACK_REQ,
+				      rpid, GUPCR_PTL_PTE_BARRIER_UP,
+				      PTL_NO_MATCH_BITS, 0, PTL_NULL_USER_PTR,
+				      PTL_NULL_HDR_DATA, PTL_MIN,
+				      PTL_INT32_T));
+    }
+#else
+  /* The UPC runtime barrier implementation that does not use
+     Portals triggered operations does not support split phase barriers.
+     In this case, all Portals actions related to the barrier
+     are performed in the __upc_wait() function.  */
+#endif
+  gupcr_trace (FC_BARRIER, "BARRIER NOTIFY EXIT %d", barrier_id);
+}
+
+/**
+ * @fn __upc_wait (int barrier_id)
+ * UPC <i>upc_wait</i> statement implementation
+ *
+ * This procedure waits to receive the derived consensus
+ * barrier ID from the parent (leaf thread) or acknowledges that
+ * all children received the consensus barrier ID (inner
+ * and root threads).  The consensus barrier ID is checked
+ * against the barrier ID passed in as an argument.
+ * @param [in] barrier_id Barrier ID
+ */
+void
+__upc_wait (int barrier_id)
+{
+  ptl_ct_event_t ct;
+  ptl_process_t rpid __attribute ((unused));
+  int received_barrier_id;
+  gupcr_trace (FC_BARRIER, "BARRIER WAIT ENTER %d", barrier_id);
+
+  if (!gupcr_barrier_active)
+    gupcr_error ("upc_wait statement executed without a "
+		 "preceding upc_notify");
+
+  /* Check if notify/wait barrier IDs match.
+     BARRIER_ANONYMOUS matches any other barrier ID.  */
+  if ((barrier_id != BARRIER_ANONYMOUS &&
+       gupcr_barrier_id != BARRIER_ANONYMOUS) &&
+      (gupcr_barrier_id != barrier_id))
+    {
+      gupcr_error ("UPC barrier identifier mismatch - notify %d, wait %d",
+		   gupcr_barrier_id, barrier_id);
+    }
+
+  if (THREADS == 1)
+    {
+      gupcr_barrier_active = 0;
+      return;
+    }
+
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+  /* Wait for the barrier ID to propagate down the tree.  */
+  if (gupcr_child_cnt)
+    {
+      /* Wait for the barrier ID to flow down to the children.  */
+      gupcr_wait_md_count += gupcr_child_cnt;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_wait_md_ct, gupcr_wait_md_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_wait_md_eq);
+	  gupcr_fatal_error ("received an error on wait MD");
+	}
+    }
+  else
+    {
+      gupcr_wait_le_count += 1;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_wait_le_ct, gupcr_wait_le_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_wait_le_eq);
+	  gupcr_fatal_error ("received an error on wait LE");
+	}
+    }
+  received_barrier_id = *gupcr_wait_ptr;
+#else
+  /* UPC Barrier implementation without Portals Triggered Functions.  */
+
+  /* NOTIFY - Propagate minimal barrier ID to the root thread.  */
+
+  /* Use the barrier maximum ID number if the barrier ID is "match all".
+     This effectively excludes the thread from setting the minimum ID
+     among the threads.  */
+  gupcr_barrier_value = (barrier_id == BARRIER_ANONYMOUS) ?
+    BARRIER_ID_MAX : barrier_id;
+
+  if (!LEAF_THREAD)
+    {
+      /* This step is performed by the root thread and inner threads.  */
+      /* Find the minimal barrier ID among the thread and children.
+         Use the Portals PTL_MIN atomic operation on the value
+	 in the notify LE.  */
+      gupcr_debug (FC_BARRIER, "Send atomic PTL_MIN %d to (%d)",
+		   gupcr_barrier_value, MYTHREAD);
+      rpid.rank = MYTHREAD;
+      gupcr_portals_call (PtlAtomic, (gupcr_barrier_md, 0,
+				      BARRIER_ID_SIZE, PTL_NO_ACK_REQ,
+				      rpid, GUPCR_PTL_PTE_BARRIER_UP,
+				      PTL_NO_MATCH_BITS, 0, PTL_NULL_USER_PTR,
+				      PTL_NULL_HDR_DATA, PTL_MIN,
+				      PTL_INT32_T));
+      /* Wait for all children threads to report their barrier IDs.
+         Account for this thread's atomic PTL_MIN.  */
+      gupcr_notify_le_count += gupcr_child_cnt + 1;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_notify_le_ct, gupcr_notify_le_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_notify_le_eq);
+	  gupcr_fatal_error ("received an error on notify LE");
+	}
+    }
+
+  if (!ROOT_THREAD)
+    {
+      ptl_handle_md_t source_md;
+
+      /* This step is performed by leaf threads and inner threads.  */
+      /* Send the barrier ID to the parent - use atomic PTL_MIN on the value
+         in the parents notify LE (derived minimal ID for the parent and its
+         children).  */
+      gupcr_debug (FC_BARRIER, "Send atomic PTL_MIN %d to (%d)",
+		   gupcr_barrier_value, gupcr_parent_thread);
+      if (LEAF_THREAD)
+	source_md = gupcr_barrier_md;
+      else
+	/* An inner thread uses the minimal barrier ID
+	   derived from the parent thread and all its children.  */
+	source_md = gupcr_notify_md;
+      rpid.rank = gupcr_parent_thread;
+      gupcr_portals_call (PtlAtomic,
+			  (source_md, 0, BARRIER_ID_SIZE, PTL_NO_ACK_REQ,
+			   rpid, GUPCR_PTL_PTE_BARRIER_UP,
+			   PTL_NO_MATCH_BITS, 0, PTL_NULL_USER_PTR,
+			   PTL_NULL_HDR_DATA, PTL_MIN, PTL_INT32_T));
+    }
+
+  /* At this point, the derived minimal barrier ID among all threads
+     has arrived at the root thread.  */
+  if (ROOT_THREAD)
+    {
+      *(int *) gupcr_wait_ptr = gupcr_notify_value;
+    }
+  else
+    {
+      /* Wait for the parent to send the derived agreed on barrier ID.  */
+      gupcr_wait_le_count += 1;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_wait_le_ct, gupcr_wait_le_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_wait_le_eq);
+	  gupcr_fatal_error ("received an error on wait LE");
+	}
+    }
+
+  /* The consensus barrier ID always ends up in the wait buffer:
+     the root thread stores it there directly (above), and every
+     other thread receives it there from its parent.  Reading
+     GUPCR_NOTIFY_VALUE here would yield only this thread's subtree
+     minimum (or BARRIER_ID_MAX on a leaf thread, which the mismatch
+     check below deliberately ignores), so the mismatch check would
+     be defeated.  Match the triggered-ops path and read the wait
+     buffer instead.  */
+  received_barrier_id = *(int *) gupcr_wait_ptr;
+
+  /* An inner thread sends the derived consensus
+     minimum barrier ID to its children.  */
+  if (!LEAF_THREAD)
+    {
+      int i;
+
+      /* Re-initialize the barrier ID maximum range value.  */
+      gupcr_notify_value = BARRIER_ID_MAX;
+
+      /* Send the derived consensus minimum barrier ID to
+         this thread's children.  */
+      for (i = 0; i < gupcr_child_cnt; i++)
+	{
+	  rpid.rank = gupcr_child[i];
+	  gupcr_portals_call (PtlPut,
+			      (gupcr_wait_md, 0, BARRIER_ID_SIZE,
+			       PTL_OC_ACK_REQ, rpid,
+			       GUPCR_PTL_PTE_BARRIER_DOWN, PTL_NO_MATCH_BITS,
+			       0, PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA));
+	}
+
+      /* Wait until all children receive the consensus minimum
+         barrier ID that is propagated down the tree.  */
+      gupcr_wait_md_count += gupcr_child_cnt;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_wait_md_ct, gupcr_wait_md_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_wait_md_eq);
+	  gupcr_fatal_error ("received an error on wait MD");
+	}
+    }
+
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+  /* Verify that the barrier ID matches.  */
+  if (barrier_id != INT_MIN &&
+      barrier_id != received_barrier_id &&
+      received_barrier_id != BARRIER_ID_MAX)
+    gupcr_error ("thread %d: UPC barrier identifier mismatch among threads - "
+		 "expected %d, received %d",
+		 MYTHREAD, barrier_id, received_barrier_id);
+
+  /* UPC Shared Memory Consistency Model requires all outstanding
+     read/write operations to complete on the thread's enter
+     into the next synchronization phase.  */
+  gupcr_gmem_sync ();
+
+  gupcr_barrier_active = 0;
+
+  gupcr_trace (FC_BARRIER, "BARRIER WAIT EXIT %d", barrier_id);
+}
+
+/**
+ * @fn __upc_barrier (int barrier_id)
+ * UPC language upc_barrier implementation.
+ *
+ * @param [in] barrier_id Barrier ID
+ */
+void
+__upc_barrier (int barrier_id)
+{
+  /* A full barrier is a notify immediately followed by the
+     matching wait, using the same barrier ID.  */
+  __upc_notify (barrier_id);
+  __upc_wait (barrier_id);
+}
+
+/* This Portals4 based broadcast implementation uses barrier resources
+ * to pass the broadcast message from thread 0 to all other threads.  */
+
+/**
+ * @fn gupcr_bcast_send (void *value, size_t nbytes)
+ * Send broadcast message to all thread's children.
+ *
+ * The broadcast is a collective operation where thread 0 (root thread)
+ * sends a message to all other threads.  This function must be
+ * called only by thread 0, from the public function
+ * "gupcr_broadcast_put".
+ *
+ * @param [in] value Pointer to send value
+ * @param [in] nbytes Number of bytes to send
+ * @ingroup BROADCAST
+ */
+void
+gupcr_bcast_send (void *value, size_t nbytes)
+{
+  int i;
+  ptl_process_t rpid;
+  ptl_ct_event_t ct;
+
+  gupcr_trace (FC_BROADCAST, "BROADCAST SEND ENTER 0x%lx %lu",
+	       (long unsigned) value, (long unsigned) nbytes);
+
+  /* This broadcast operation is implemented a collective operation.
+     Before proceeding, complete all outstanding shared memory
+     read/write operations.  */
+  gupcr_gmem_sync ();
+
+  /* Copy the message into the buffer used for delivery
+     to the children threads.  */
+  memcpy (gupcr_wait_ptr, value, nbytes);
+
+  /* Wait until all children signal (via this thread's notify LE)
+     that they are ready to receive the broadcast value; see
+     gupcr_bcast_recv, which sends this notification up the tree.  */
+  gupcr_notify_le_count += gupcr_child_cnt;
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_notify_le_ct, gupcr_notify_le_count, &ct));
+  if (ct.failure)
+    {
+      gupcr_process_fail_events (gupcr_notify_le_eq);
+      gupcr_fatal_error ("received an error on notify LE");
+    }
+
+  /* Send broadcast to this thread's children.  */
+  for (i = 0; i < gupcr_child_cnt; i++)
+    {
+      rpid.rank = gupcr_child[i];
+      gupcr_debug (FC_BROADCAST, "Send broadcast message to child (%d)",
+		   gupcr_child[i]);
+      gupcr_portals_call (PtlPut, (gupcr_wait_md, 0,
+				   nbytes, PTL_ACK_REQ, rpid,
+				   GUPCR_PTL_PTE_BARRIER_DOWN,
+				   PTL_NO_MATCH_BITS, 0, PTL_NULL_USER_PTR,
+				   PTL_NULL_HDR_DATA));
+    }
+
+  /* Wait for message delivery to all children.  This ensures that
+     the source buffer is not overwritten by back-to-back
+     broadcast operations.  */
+  gupcr_wait_md_count += gupcr_child_cnt;
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_wait_md_ct, gupcr_wait_md_count, &ct));
+  if (ct.failure)
+    {
+      gupcr_process_fail_events (gupcr_wait_md_eq);
+      gupcr_fatal_error ("received an error on wait MD");
+    }
+  gupcr_trace (FC_BROADCAST, "BROADCAST SEND EXIT");
+}
+
+/**
+ * @fn gupcr_bcast_recv (void *value, size_t nbytes)
+ * Wait to receive the broadcast message and return its value.
+ *
+ * Broadcast is a collective operation where thread 0 (the root thread)
+ * sends a message to all other threads.  This function must be
+ * called by every thread other than thread 0.
+ *
+ * @param [out] value Pointer to the buffer that receives the value
+ * @param [in] nbytes Number of bytes to receive
+ * @ingroup BROADCAST
+ */
+void
+gupcr_bcast_recv (void *value, size_t nbytes)
+{
+  int i;
+  ptl_process_t rpid;
+  ptl_ct_event_t ct;
+
+  gupcr_trace (FC_BROADCAST, "BROADCAST RECV ENTER 0x%lx %lu",
+	       (long unsigned) value, (long unsigned) nbytes);
+
+  gupcr_gmem_sync ();
+
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+  if (INNER_THREAD)
+    {
+      /* Prepare triggers for message push to all children.  */
+      gupcr_wait_le_count += 1;
+      for (i = 0; i < gupcr_child_cnt; i++)
+	{
+	  rpid.rank = gupcr_child[i];
+	  gupcr_debug (FC_BROADCAST,
+		       "Set broadcast trigger to the child (%d)",
+		       gupcr_child[i]);
+	  /* Trigger: message received from the parent.
+	     Action: send the message to the child.  */
+	  gupcr_portals_call (PtlTriggeredPut, (gupcr_wait_md, 0,
+						nbytes, PTL_ACK_REQ, rpid,
+						GUPCR_PTL_PTE_BARRIER_DOWN,
+						PTL_NO_MATCH_BITS, 0,
+						PTL_NULL_USER_PTR,
+						PTL_NULL_HDR_DATA,
+						gupcr_wait_le_ct,
+						gupcr_wait_le_count));
+	}
+
+      /* Prepare a trigger to send notification to the parent.  */
+      gupcr_debug (FC_BROADCAST,
+		   "Set notification trigger to the parent (%d)",
+		   gupcr_parent_thread);
+      rpid.rank = gupcr_parent_thread;
+      gupcr_barrier_value = BARRIER_ID_MAX;
+      /* Trigger: notification received from all children.
+         Action: send notification to the parent.  */
+      gupcr_notify_le_count += gupcr_child_cnt;
+      gupcr_portals_call (PtlTriggeredPut, (gupcr_barrier_md, 0,
+					    BARRIER_ID_SIZE,
+					    PTL_NO_ACK_REQ, rpid,
+					    GUPCR_PTL_PTE_BARRIER_UP,
+					    PTL_NO_MATCH_BITS, 0,
+					    PTL_NULL_USER_PTR,
+					    PTL_NULL_HDR_DATA,
+					    gupcr_notify_le_ct,
+					    gupcr_notify_le_count));
+
+      /* Wait for delivery to all children.  */
+      gupcr_wait_md_count += gupcr_child_cnt;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_wait_md_ct, gupcr_wait_md_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_wait_md_eq);
+	  gupcr_fatal_error ("received an error on wait MD");
+	}
+      gupcr_debug (FC_BROADCAST, "Received PtlPut acks: %lu",
+                   (long unsigned) ct.success);
+    }
+  else
+    {
+      /* A leaf thread sends notification to its parent that
+         it is ready to receive the broadcast value.  */
+      gupcr_debug (FC_BROADCAST, "Send notification to the parent (%d)",
+		   gupcr_parent_thread);
+      rpid.rank = gupcr_parent_thread;
+      gupcr_barrier_value = BARRIER_ID_MAX;
+      gupcr_portals_call (PtlPut, (gupcr_barrier_md, 0,
+				   BARRIER_ID_SIZE, PTL_NO_ACK_REQ, rpid,
+				   GUPCR_PTL_PTE_BARRIER_UP,
+				   PTL_NO_MATCH_BITS, 0, PTL_NULL_USER_PTR,
+				   PTL_NULL_HDR_DATA));
+
+      /* Wait to receive a message from the parent.  */
+      gupcr_wait_le_count += 1;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_wait_le_ct, gupcr_wait_le_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_wait_le_eq);
+	  gupcr_fatal_error ("received an error on wait LE");
+	}
+    }
+  memcpy (value, gupcr_wait_ptr, nbytes);
+#else
+  /* Inner threads must wait for its children threads to arrive.  */
+  if (INNER_THREAD)
+    {
+      gupcr_debug (FC_BROADCAST, "Waiting for %d notifications",
+		   gupcr_child_cnt);
+      /* Wait on the accumulated counting event threshold, not the
+	 per-call child count: the notify LE counter is never reset,
+	 so after the first broadcast its value already exceeds
+	 GUPCR_CHILD_CNT and a wait on that threshold would return
+	 immediately, racing with back-to-back broadcasts.  */
+      gupcr_notify_le_count += gupcr_child_cnt;
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_notify_le_ct, gupcr_notify_le_count, &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_notify_le_eq);
+	  gupcr_fatal_error ("received an error on notify LE");
+	}
+      gupcr_debug (FC_BROADCAST, "Received %lu broadcast notifications",
+		   (long unsigned) ct.success);
+    }
+
+  /* Inform the parent that this thread and all its children arrived.
+     Send barrier MAX value as we share PTEs with the barrier
+     implementation.  */
+  gupcr_debug (FC_BROADCAST, "Send notification to the parent %d",
+	       gupcr_parent_thread);
+  rpid.rank = gupcr_parent_thread;
+  gupcr_barrier_value = BARRIER_ID_MAX;
+  gupcr_portals_call (PtlPut, (gupcr_barrier_md, 0,
+			       BARRIER_ID_SIZE, PTL_NO_ACK_REQ, rpid,
+			       GUPCR_PTL_PTE_BARRIER_UP, PTL_NO_MATCH_BITS, 0,
+			       PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA));
+
+  /* Receive the broadcast message from the parent.  */
+  gupcr_wait_le_count += 1;
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_wait_le_ct, gupcr_wait_le_count, &ct));
+  if (ct.failure)
+    {
+      gupcr_process_fail_events (gupcr_wait_le_eq);
+      gupcr_fatal_error ("received an error on wait LE");
+    }
+
+  /* Copy the received message.  */
+  memcpy (value, gupcr_wait_ptr, nbytes);
+
+  if (INNER_THREAD)
+    {
+      /* An inner thread must pass the message to its children.  */
+      for (i = 0; i < gupcr_child_cnt; i++)
+	{
+	  gupcr_debug (FC_BROADCAST, "Sending a message to %d",
+		       gupcr_child[i]);
+	  rpid.rank = gupcr_child[i];
+	  gupcr_portals_call (PtlPut, (gupcr_wait_md, 0,
+				       nbytes, PTL_ACK_REQ, rpid,
+				       GUPCR_PTL_PTE_BARRIER_DOWN,
+				       PTL_NO_MATCH_BITS, 0,
+				       PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA));
+	}
+      /* Wait for delivery to all children.  */
+      gupcr_wait_md_count += gupcr_child_cnt;
+      gupcr_portals_call (PtlCTWait, (gupcr_wait_md_ct, gupcr_wait_md_count,
+				      &ct));
+      if (ct.failure)
+	{
+	  gupcr_process_fail_events (gupcr_wait_md_eq);
+	  gupcr_fatal_error ("received an error on wait MD");
+	}
+    }
+#endif
+  gupcr_trace (FC_BROADCAST, "BROADCAST RECV EXIT");
+}
+
+/**
+ * @fn gupcr_barrier_init (void)
+ * Initialize barrier resources.
+ * @ingroup INIT
+ */
+void
+gupcr_barrier_init (void)
+{
+  ptl_pt_index_t pte;
+  ptl_le_t le;
+  ptl_md_t md;
+
+  gupcr_log (FC_BARRIER, "barrier init called");
+
+  /* Create necessary CT handles.  */
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_notify_le_ct));
+  gupcr_notify_le_count = 0;
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_notify_md_ct));
+  gupcr_notify_md_count = 0;
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_wait_le_ct));
+  gupcr_wait_le_count = 0;
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_wait_md_ct));
+  gupcr_wait_md_count = 0;
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_barrier_md_ct));
+  gupcr_barrier_md_count = 0;
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_barrier_max_md_ct));
+  gupcr_barrier_max_md_count = 0;
+
+  /* Create necessary EQ handles.  Allocate only one event queue entry
+     as we abort on any error.  */
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_notify_le_eq));
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_notify_md_eq));
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_wait_le_eq));
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_wait_md_eq));
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_barrier_md_eq));
+  gupcr_portals_call (PtlEQAlloc,
+		      (gupcr_ptl_ni, 1, &gupcr_barrier_max_md_eq));
+
+  /* Allocate PTEs.  The barrier PTE indexes are fixed, well-known
+     values; abort if Portals does not grant the requested index.  */
+  gupcr_portals_call (PtlPTAlloc, (gupcr_ptl_ni, 0,
+				   gupcr_notify_le_eq,
+				   GUPCR_PTL_PTE_BARRIER_UP, &pte));
+  if (pte != GUPCR_PTL_PTE_BARRIER_UP)
+    gupcr_fatal_error ("cannot allocate GUPCR_PTL_PTE_BARRIER_UP PTE");
+  gupcr_debug (FC_BARRIER, "Barrier UP PTE allocated: %d",
+	       GUPCR_PTL_PTE_BARRIER_UP);
+  gupcr_portals_call (PtlPTAlloc, (gupcr_ptl_ni, 0,
+				   gupcr_wait_le_eq,
+				   GUPCR_PTL_PTE_BARRIER_DOWN, &pte));
+  if (pte != GUPCR_PTL_PTE_BARRIER_DOWN)
+    gupcr_fatal_error ("cannot allocate GUPCR_PTL_PTE_BARRIER_DOWN PTE");
+  gupcr_debug (FC_BARRIER, "Barrier DOWN PTE allocated: %d",
+	       GUPCR_PTL_PTE_BARRIER_DOWN);
+
+  /* Children perform atomic MIN on up_value,
+     make sure we start with the maximum possible value.  */
+  gupcr_notify_value = BARRIER_ID_MAX;
+
+  /* Create LE for barrier ID value traveling up the tree.  */
+  le.start = &gupcr_notify_value;
+  le.length = sizeof (gupcr_notify_value);
+  le.ct_handle = gupcr_notify_le_ct;
+  le.uid = PTL_UID_ANY;
+  le.options = PTL_LE_OP_PUT | PTL_LE_OP_GET |
+    PTL_LE_EVENT_CT_COMM | PTL_LE_EVENT_SUCCESS_DISABLE |
+    PTL_LE_EVENT_LINK_DISABLE;
+  gupcr_portals_call (PtlLEAppend,
+		      (gupcr_ptl_ni, GUPCR_PTL_PTE_BARRIER_UP, &le,
+		       PTL_PRIORITY_LIST, NULL, &gupcr_notify_le));
+
+  /* Create LE for barrier ID value traveling down the tree.
+     Allocate enough space as barrier resources are
+     used to also broadcast arbitrary values.  */
+  gupcr_malloc (gupcr_wait_ptr, GUPCR_MAX_BROADCAST_SIZE);
+  le.start = gupcr_wait_ptr;
+  le.length = GUPCR_MAX_BROADCAST_SIZE;
+  le.ct_handle = gupcr_wait_le_ct;
+  le.uid = PTL_UID_ANY;
+  le.options = PTL_LE_OP_PUT | PTL_LE_OP_GET |
+    PTL_LE_EVENT_CT_COMM | PTL_LE_EVENT_SUCCESS_DISABLE |
+    PTL_LE_EVENT_LINK_DISABLE;
+  gupcr_portals_call (PtlLEAppend,
+		      (gupcr_ptl_ni, GUPCR_PTL_PTE_BARRIER_DOWN, &le,
+		       PTL_PRIORITY_LIST, NULL, &gupcr_wait_le));
+
+  /* Create source MD for the subtree-minimum barrier ID (the notify
+     LE value) that an inner thread forwards up the tree.  */
+  md.start = &gupcr_notify_value;
+  md.length = sizeof (gupcr_notify_value);
+  md.options = PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_SUCCESS_DISABLE;
+  md.eq_handle = gupcr_notify_md_eq;
+  md.ct_handle = gupcr_notify_md_ct;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_notify_md));
+
+  /* Create source MD for barrier ID values sent down the tree.  */
+  md.start = gupcr_wait_ptr;
+  md.length = GUPCR_MAX_BROADCAST_SIZE;
+  md.options = PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_SUCCESS_DISABLE;
+  md.eq_handle = gupcr_wait_md_eq;
+  md.ct_handle = gupcr_wait_md_ct;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_wait_md));
+
+  /* Create source MD for this thread's own barrier ID value, used
+     as the source of the atomic PTL_MIN operations sent up the tree.  */
+  md.start = &gupcr_barrier_value;
+  md.length = sizeof (gupcr_barrier_value);
+  md.options = PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_SUCCESS_DISABLE;
+  md.eq_handle = gupcr_barrier_md_eq;
+  md.ct_handle = gupcr_barrier_md_ct;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_barrier_md));
+
+  /* Create source MD that is used to re-initialize the
+     consensus minimum barrier ID value to the maximum
+     possible value.  */
+  md.start = &gupcr_barrier_max_value;
+  md.length = sizeof (gupcr_barrier_max_value);
+  md.options = PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_SUCCESS_DISABLE;
+  md.eq_handle = gupcr_barrier_max_md_eq;
+  md.ct_handle = gupcr_barrier_max_md_ct;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_barrier_max_md));
+}
+
+/**
+ * @fn gupcr_barrier_fini (void)
+ * Release barrier resources.
+ * @ingroup INIT
+ */
+void
+gupcr_barrier_fini (void)
+{
+  gupcr_log (FC_BARRIER, "barrier fini called");
+
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+  /* Cancel any outstanding triggered operations before their
+     counting events are freed below.  */
+  gupcr_portals_call (PtlCTCancelTriggered, (gupcr_wait_le_ct));
+  gupcr_portals_call (PtlCTCancelTriggered, (gupcr_barrier_max_md_ct));
+  gupcr_portals_call (PtlCTCancelTriggered, (gupcr_notify_le_ct));
+  gupcr_portals_call (PtlCTCancelTriggered, (gupcr_wait_md_ct));
+#endif
+
+  /* Release MDs and their CTs.  */
+  gupcr_portals_call (PtlMDRelease, (gupcr_barrier_md));
+  gupcr_portals_call (PtlCTFree, (gupcr_barrier_md_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_barrier_md_eq));
+  gupcr_portals_call (PtlMDRelease, (gupcr_barrier_max_md));
+  gupcr_portals_call (PtlCTFree, (gupcr_barrier_max_md_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_barrier_max_md_eq));
+  gupcr_portals_call (PtlMDRelease, (gupcr_notify_md));
+  gupcr_portals_call (PtlCTFree, (gupcr_notify_md_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_notify_md_eq));
+  gupcr_portals_call (PtlMDRelease, (gupcr_wait_md));
+  gupcr_portals_call (PtlCTFree, (gupcr_wait_md_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_wait_md_eq));
+
+  /* Release LEs, their CTs, and PTEs.  Each PTE is freed only
+     after the LE appended to it has been unlinked.  */
+  gupcr_portals_call (PtlLEUnlink, (gupcr_notify_le));
+  gupcr_portals_call (PtlCTFree, (gupcr_notify_le_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_notify_le_eq));
+  gupcr_portals_call (PtlPTFree, (gupcr_ptl_ni, GUPCR_PTL_PTE_BARRIER_UP));
+
+  gupcr_portals_call (PtlLEUnlink, (gupcr_wait_le));
+  gupcr_portals_call (PtlCTFree, (gupcr_wait_le_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_wait_le_eq));
+  gupcr_portals_call (PtlPTFree, (gupcr_ptl_ni, GUPCR_PTL_PTE_BARRIER_DOWN));
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_barrier.h
===================================================================
--- libgupc/portals4/gupcr_barrier.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_barrier.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,49 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef _GUPCR_BARRIER_H_
+#define _GUPCR_BARRIER_H_
+
+/**
+ * @file gupcr_barrier.h
+ * GUPC Portals4 barrier implementation.
+ *
+ * @addtogroup BARRIER GUPCR Barrier Functions
+ * @{
+ */
+
+/* Barrier setup/teardown (called once per thread at startup/exit).  */
+extern void gupcr_barrier_init (void);
+extern void gupcr_barrier_fini (void);
+
+/* Broadcast support functions.  */
+extern void gupcr_bcast_send (void *, size_t);
+extern void gupcr_bcast_recv (void *, size_t);
+
+/* Current barrier ID.  */
+extern int gupcr_barrier_id;
+
+/** @} */
+#endif /* _GUPCR_BARRIER_H_ */
Index: libgupc/portals4/gupcr_broadcast.c
===================================================================
--- libgupc/portals4/gupcr_broadcast.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_broadcast.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,122 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_lib.h"
+#include "gupcr_sup.h"
+#include "gupcr_utils.h"
+#include "gupcr_barrier.h"
+#include "gupcr_broadcast.h"
+
+/**
+ * @file gupcr_broadcast.c
+ * GUPC Portals4 broadcast implementation.
+ *
+ * The broadcast utility functions are internal functions used by
+ * the UPC runtime when it is necessary to propagate (broadcast)
+ * a value from thread 0 to all other threads.
+ * For example, upc_all_alloc and upc_all_lock_alloc will
+ * call the broadcast functions to propagate their result to all threads.
+ *
+ * The broadcast functions use an algorithm that is a variant of
+ * the one used to implement a barrier.  The "up phase" signals
+ * that each thread is ready to receive the broadcast value, while the
+ * "down phase" is used to receive the actual value.
+ */
+
+/**
+ * @addtogroup BROADCAST GUPCR Broadcast Functions
+ * @{
+ */
+
+/**
+ * Receive the broadcast value.
+ *
+ * A thread signals to its parent (by writing to its parent's UP PTE)
+ * that it is ready to receive a broadcast value, and then waits on
+ * its down LE counting event until the message arrives.
+ * @param [out] value Pointer to the buffer that receives the value
+ * @param [in] nbytes Number of bytes to receive
+ */
+void
+gupcr_broadcast_get (void *value, size_t nbytes)
+{
+  /* Thread 0 is the broadcast sender; it must not call this.  */
+  if (!MYTHREAD)
+    gupcr_fatal_error ("called from thread 0");
+  if (nbytes > GUPCR_MAX_BROADCAST_SIZE)
+    gupcr_fatal_error ("size of broadcast message (%ld) is greater than "
+		       "the maximum allowed (%d)",
+		       (long int) nbytes, GUPCR_MAX_BROADCAST_SIZE);
+  /* Wait to receive the broadcast message.  */
+  gupcr_bcast_recv (value, nbytes);
+}
+
+/**
+ * Send the broadcast value.  This function must be called
+ * only by thread 0 (the root thread).  The broadcast value
+ * is pushed down the tree by first sending the value
+ * to the children of the root thread.
+ *
+ * @param [in] value Pointer to send value
+ * @param [in] nbytes Number of bytes to send
+ */
+void
+gupcr_broadcast_put (void *value, size_t nbytes)
+{
+  /* With a single thread there is nobody to broadcast to.  */
+  if (THREADS == 1)
+    return;
+  /* Only thread 0 (the root thread) may send a broadcast.  */
+  if (MYTHREAD)
+    gupcr_fatal_error ("called from thread other than 0");
+  if (nbytes > GUPCR_MAX_BROADCAST_SIZE)
+    gupcr_fatal_error ("size of broadcast message (%ld) is greater than "
+		       "maximum allowed (%d)",
+		       (long int) nbytes, GUPCR_MAX_BROADCAST_SIZE);
+  /* Send the broadcast message to the children of the root thread.  */
+  gupcr_bcast_send (value, nbytes);
+}
+
+/**
+ * Initialize broadcast resources.
+ * @ingroup INIT
+ */
+void
+gupcr_broadcast_init (void)
+{
+  /* No broadcast-specific resources to set up; the broadcast shares
+     the barrier's Portals resources (see gupcr_barrier_init).  */
+  gupcr_log (FC_BROADCAST, "broadcast init called");
+}
+
+/**
+ * Release broadcast resources.
+ * @ingroup INIT
+ */
+void
+gupcr_broadcast_fini (void)
+{
+  /* Nothing to release; the shared barrier resources are torn down
+     by gupcr_barrier_fini.  */
+  gupcr_log (FC_BROADCAST, "broadcast fini called");
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_broadcast.h
===================================================================
--- libgupc/portals4/gupcr_broadcast.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_broadcast.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,50 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef _GUPCR_BROADCAST_H_
+#define _GUPCR_BROADCAST_H_
+
+/**
+ * @file gupcr_broadcast.h
+ * GUPC Portals4 broadcast implementation.
+ */
+
+/**
+ * @addtogroup BROADCAST GUPCR Broadcast Functions
+ * @{
+ */
+
+/** Maximum message size that can be sent via broadcast.  */
+#define GUPCR_MAX_BROADCAST_SIZE 32
+
+/** @} */
+
+extern void gupcr_broadcast_get (void *value, size_t nbytes);
+extern void gupcr_broadcast_put (void *value, size_t nbytes);
+extern void gupcr_broadcast_init (void);
+extern void gupcr_broadcast_fini (void);
+
+#endif /* gupcr_broadcast.h */
Index: libgupc/portals4/gupcr_castable.upc
===================================================================
--- libgupc/portals4/gupcr_castable.upc	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_castable.upc	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,69 @@
+/* Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include <upc.h>
+#include <upc_castable.h>
+#include "gupcr_portals.h"
+#include "gupcr_pts.h"
+#include "gupcr_gmem.h"
+#include "gupcr_node.h"
+
+/* upc_cast: return a raw local pointer for the shared pointer PTR
+   when the referenced object resides in memory this thread can
+   address directly (GUPCR_GMEM_IS_LOCAL); return NULL otherwise
+   or when PTR is a NULL shared pointer.  */
+void *
+upc_cast (const shared void *ptr)
+{
+  /* Convert the UPC shared pointer into its runtime representation.  */
+  const upc_shared_ptr_t sptr = GUPCR_PTS_TO_REP (ptr);
+  void *local_ptr = NULL;
+  if (!GUPCR_PTS_IS_NULL (sptr))
+    {
+      const size_t thread = GUPCR_PTS_THREAD (sptr);
+      const int thread_as_int = (int) thread;
+      if (thread_as_int >= THREADS)
+	gupcr_fatal_error ("thread number %d in shared address "
+	                   "is out of range", thread_as_int);
+      if (GUPCR_GMEM_IS_LOCAL (thread))
+	{
+	  /* Translate (thread, offset) into a local address.  */
+	  size_t offset = GUPCR_PTS_OFFSET (sptr);
+	  local_ptr = GUPCR_GMEM_OFF_TO_LOCAL (thread, offset);
+	}
+    }
+  return local_ptr;
+}
+
+/* upc_thread_info: report castability of THREAD's shared memory.
+   Both fields are set to UPC_CASTABLE_ALL when THREAD's shared
+   memory is locally addressable (GUPCR_GMEM_IS_LOCAL); otherwise
+   a zeroed structure is returned.
+   NOTE(review): the error message mentions "in shared address",
+   but THREAD is a direct argument here — consider rewording.  */
+upc_thread_info_t
+upc_thread_info (size_t thread)
+{
+  const int thread_as_int = (int) thread;
+  upc_thread_info_t cast_info = { 0, 0 };
+  if (thread_as_int >= THREADS)
+    gupcr_fatal_error ("thread number %d in shared address "
+		       "is out of range", thread_as_int);
+  if (GUPCR_GMEM_IS_LOCAL (thread))
+    {
+      cast_info.guaranteedCastable = UPC_CASTABLE_ALL;
+      cast_info.probablyCastable = UPC_CASTABLE_ALL;
+    }
+  return cast_info;
+}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]