This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[UPC 20/22] libgupc runtime library [8/9]


[NOTE: Due to email list size limits, this patch is broken into 9 parts.]

Background
----------

An overview email, describing the UPC-related changes is here:
  https://gcc.gnu.org/ml/gcc-patches/2015-12/msg00005.html

The GUPC branch is described here:
  http://gcc.gnu.org/projects/gupc.html

The UPC-related source code differences are summarized here:
  http://gccupc.org/gupc-changes

All languages (c, c++, fortran, go, lto, objc, obj-c++) have been
bootstrapped; no test suite regressions were introduced,
relative to the GCC trunk.

If you are on the cc-list, your name was chosen either
because you are listed as a maintainer for the area that
applies to the patches described in this email, or you
were a frequent contributor of patches made to files listed
in this email.

In the change log entries included in each patch, the directory
containing the affected files is listed, followed by the files.
When the patches are applied, the change log entries will be
distributed to the appropriate ChangeLog file.

Overview
--------

Libgupc is the UPC runtime library, for GUPC.  The configuration,
makefile, and documentation related changes have been broken out into
separate patches.

As noted in the ChangeLog entry below, this is all new code.
Two communication layers are supported: (1) SMP, via 'mmap'
or (2) the Portals4 library API, which supports multi-node
operation.  Libgupc generally requires a POSIX-compliant target OS.

The 'smp' runtime is the default runtime.  The 'portals4'
runtime is experimental; it supports multi-node operation
using the Portals4 communications library.

Most of the libgupc/include/ directory contains standard headers
defined by the UPC language specification. 'make install' will
install these headers in the directory where other "C"
header files are located.

2015-11-30  Gary Funck  <gary@intrepid.com>

	libgupc/portals4/
	* gupcr_clock.c: New.
	* gupcr_coll_broadcast.upc: New.
	* gupcr_coll_init.upc: New.
	* gupcr_coll_reduce.upc: New.
	* gupcr_coll_sup.c: New.
	* gupcr_coll_sup.h: New.
	* gupcr_config.h: New.
	* gupcr_defs.h: New.
	* gupcr_env.c: New.
	* gupcr_gmem.c: New.
	* gupcr_gmem.h: New.
	* gupcr_lib.h: New.

Index: libgupc/portals4/gupcr_clock.c
===================================================================
--- libgupc/portals4/gupcr_clock.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_clock.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,138 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_clock.c
+ * GUPC Clock routines.
+ */
+
+/**
+ * @addtogroup UPCCLOCK UPC Clock Functions
+ * @{
+ */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_utils.h"
+
+static double gupcr_clock_rez;
+static double gupcr_clock_base;
+
+#if HAVE_CLOCK_GETTIME
+
+#if defined (CLOCK_MONOTONIC_RAW) && defined (CLOCK_MONOTONIC)
+#define GUPCR_CLOCK_ID CLOCK_MONOTONIC_RAW
+/* On some RHEL/CentOS systems, the timer resolution returned for
+   CLOCK_MONOTONIC_RAW is incorrect.  Use CLOCK_MONOTONIC instead.  */
+#define GUPCR_CLOCK_REZ_ID CLOCK_MONOTONIC
+#elif defined (CLOCK_MONOTONIC)
+#define GUPCR_CLOCK_ID CLOCK_MONOTONIC
+#define GUPCR_CLOCK_REZ_ID CLOCK_MONOTONIC
+#else
+#error missing system clock name definition.
+#endif
+
+double
+gupcr_clock (void)
+{
+  struct timespec ts;
+  double t;
+  gupcr_syscall (clock_gettime, (GUPCR_CLOCK_ID, &ts));
+  t = (double) ts.tv_sec + (double) ts.tv_nsec * 1.0e-9;
+  t -= gupcr_clock_base;
+  return t;
+}
+
+double
+gupcr_clock_resolution (void)
+{
+  return gupcr_clock_rez;
+}
+
+void
+gupcr_clock_init (void)
+{
+  struct timespec clock_rez;
+  gupcr_syscall (clock_getres, (GUPCR_CLOCK_REZ_ID, &clock_rez));
+  gupcr_assert (clock_rez.tv_sec == 0);
+  gupcr_clock_rez = clock_rez.tv_nsec * 1.0e-9;
+  gupcr_clock_base = gupcr_clock ();
+}
+
+#else /* Use gettimeofday().  */
+
+double
+gupcr_clock (void)
+{
+  struct timeval tv;
+  double t;
+  gupcr_syscall (gettimeofday, (&tv, NULL));
+  t = (double) tv.tv_sec + (double) tv.tv_usec * 1.0e-6;
+  t -= gupcr_clock_base;
+  return t;
+}
+
+double
+gupcr_clock_resolution (void)
+{
+  return gupcr_clock_rez;
+}
+
+void
+gupcr_clock_init (void)
+{
+  int i;
+  gupcr_clock_base = gupcr_clock ();
+  gupcr_clock_rez = 1.0;
+  for (i = 1; i <= 10; ++i)
+    {
+      double t1, t2, diff;
+      t1 = gupcr_clock ();
+      do
+	{
+	  t2 = gupcr_clock ();
+	}
+      while (t1 == t2);
+      diff = t2 - t1;
+      if (diff < gupcr_clock_rez)
+	gupcr_clock_rez = diff;
+    }
+  /* Round the clock resolution to some common values
+     if it is within range of one of them.  */
+  if (gupcr_clock_rez > 0.9e-6 && gupcr_clock_rez < 1.1e-6)
+    gupcr_clock_rez = 1.0e-6;
+  else if (gupcr_clock_rez > 0.9e-3 && gupcr_clock_rez < 1.1e-3)
+    gupcr_clock_rez = 1.0e-3;
+  else if (gupcr_clock_rez > 0.9e-2 && gupcr_clock_rez < 1.1e-2)
+    gupcr_clock_rez = 1.0e-2;
+  else if (gupcr_clock_rez > 1.63e-2 && gupcr_clock_rez < 1.69e-2)
+    gupcr_clock_rez = 1.0 / 60.0;
+  else if (gupcr_clock_rez > 1.95e-2 && gupcr_clock_rez < 2.05e-2)
+    gupcr_clock_rez = 1.0 / 50.0;
+}
+
+#endif
+/** @} */
Index: libgupc/portals4/gupcr_coll_broadcast.upc
===================================================================
--- libgupc/portals4/gupcr_coll_broadcast.upc	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_coll_broadcast.upc	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,129 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include <upc.h>
+#include <upc_collective.h>
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/**
+ * @file gupcr_coll_broadcast.upc
+ * GUPC Portals4 collectives broadcast implementation.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/**
+ * @fn upc_all_broadcast (shared void *dst,
+ *		   shared const void *src,
+ *		   size_t nbytes, upc_flag_t sync_mode)
+ * Broadcast data referenced by the src pointer.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] nbytes Number of bytes to broadcast
+ * @param [in] sync_mode Synchronization mode
+ * @ingroup COLLECTIVES
+ */
+
+void
+upc_all_broadcast (shared void *dst, shared const void *src,
+		   size_t nbytes, upc_flag_t sync_mode)
+{
+  size_t src_thread = upc_threadof ((shared void *) src);
+  size_t send_cnt = nbytes;
+  int i, blk_cnt;
+
+  gupcr_trace (FC_COLL, "COLL ALL_BROADCAST ENTER %lu %lu",
+	       (long unsigned) src_thread, (long unsigned) nbytes);
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, nbytes, sync_mode, 0, 0, 0, UPC_BRDCST);
+#endif
+
+  /* Initialize the collectives broadcast tree.  */
+  gupcr_coll_tree_setup (src_thread, 0, THREADS);
+
+  /* Optional IN synchronization mode.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  blk_cnt = 0;
+  while (send_cnt)
+    {
+      size_t blk_size = (send_cnt > GUPCR_PORTALS_MAX_MSG_SIZE) ?
+	GUPCR_PORTALS_MAX_MSG_SIZE : send_cnt;
+      send_cnt -= blk_size;
+
+      if (MYTHREAD != (int) src_thread)
+	{
+	  /* Wait for parent to deliver data.  */
+	  gupcr_coll_signal_wait (1);
+	}
+      else
+	{
+	  /* Copy data into the thread's own memory.  */
+	  size_t doffset = upc_addrfield ((shared char *) dst + MYTHREAD);
+	  size_t soffset = upc_addrfield ((shared void *) src);
+	  doffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+	  soffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+	  gupcr_debug (FC_COLL,
+		       "Local copy - doffset: %lld soffset: %lld nbytes: %lld",
+		       (long long int) doffset, (long long int) soffset,
+		       (long long int) nbytes);
+	  memcpy ((char *) gupcr_gmem_base + doffset,
+		  (char *) gupcr_gmem_base + soffset, blk_size);
+	}
+
+      /* Send data to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      int dthread = gupcr_coll_child[i];
+	      size_t doffset = upc_addrfield ((shared char *) dst + dthread);
+	      size_t soffset = upc_addrfield ((shared char *) dst + MYTHREAD);
+	      doffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+	      soffset += blk_cnt * GUPCR_PORTALS_MAX_MSG_SIZE;
+	      gupcr_coll_put (dthread, doffset, soffset, blk_size);
+	    }
+	  gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+	}
+      ++blk_cnt;
+    }
+
+  /* Optional OUT synchronization mode.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+  gupcr_trace (FC_COLL, "COLL ALL_BROADCAST EXIT");
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_coll_init.upc
===================================================================
--- libgupc/portals4/gupcr_coll_init.upc	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_coll_init.upc	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,67 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include <upc.h>
+#include <upc_collective.h>
+#include <upc_coll.h>
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/**
+ * @file gupcr_coll_init.upc
+ * GUPC Portals4 collectives initialization.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+int upc_coll_init_flag = 0;
+
+/**
+ * Collectives initialization function.
+ *
+ * Initialize necessary storage area for the broadcast/reduce
+ * thread trees.
+ */
+void
+upc_coll_init ()
+{
+  if (upc_coll_init_flag)
+    gupcr_fatal_error ("multiple attempts to initialize collectives");
+  upc_coll_init_flag = 1;
+
+  /* Allocate the "all reduce" storage area.  */
+  gupcr_reduce_storage = (gupcr_reduce_str_t)
+    upc_all_alloc (THREADS, sizeof (struct gupcr_reduce_str));
+  if (gupcr_reduce_storage == NULL)
+    gupcr_fatal_error ("cannot allocate collectives reduce shared storage");
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_coll_reduce.upc
===================================================================
--- libgupc/portals4/gupcr_coll_reduce.upc	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_coll_reduce.upc	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,4978 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/*****************************************************************************/
+/*                                                                           */
+/*  Copyright (c) 2004, Michigan Technological University                    */
+/*  All rights reserved.                                                     */
+/*                                                                           */
+/*  Redistribution and use in source and binary forms, with or without       */
+/*  modification, are permitted provided that the following conditions       */
+/*  are met:                                                                 */
+/*                                                                           */
+/*  * Redistributions of source code must retain the above copyright         */
+/*  notice, this list of conditions and the following disclaimer.            */
+/*  * Redistributions in binary form must reproduce the above                */
+/*  copyright notice, this list of conditions and the following              */
+/*  disclaimer in the documentation and/or other materials provided          */
+/*  with the distribution.                                                   */
+/*  * Neither the name of the Michigan Technological University              */
+/*  nor the names of its contributors may be used to endorse or promote      */
+/*  products derived from this software without specific prior written       */
+/*  permission.                                                              */
+/*                                                                           */
+/*  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      */
+/*  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        */
+/*  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A  */
+/*  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER */
+/*  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
+/*  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      */
+/*  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       */
+/*  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   */
+/*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     */
+/*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       */
+/*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             */
+/*                                                                           */
+/*****************************************************************************/
+
+#include <stdlib.h>
+#include <upc.h>
+#include <upc_collective.h>
+#include <upc_coll.h>
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/*****************************************************************************/
+/*                                                                           */
+/*        UPC collective function library, reference implementation          */
+/*                                                                           */
+/*   Steve Seidel, Dept. of Computer Science, Michigan Technological Univ.   */
+/*   steve@mtu.edu                                        March 1, 2004      */
+/*                                                                           */
+/*****************************************************************************/
+
+/**
+ * @file gupcr_coll_reduce.upc
+ * GUPC Portals4 reduce collectives implementation.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/** Collectives reduce storage pointer */
+gupcr_reduce_str_t gupcr_reduce_storage;
+
+/**
+ * Convert from UPC reduce to Portals atomic operation.
+ *
+ * @param [in] op UPC reduce operation
+ * @return Portals atomic operation
+ */
+ptl_op_t
+gupcr_portals_reduce_op (upc_op_t op)
+{
+  switch (op)
+    {
+    case UPC_ADD:
+      return PTL_SUM;
+    case UPC_MULT:
+      return PTL_PROD;
+    case UPC_AND:
+      return PTL_BAND;
+    case UPC_OR:
+      return PTL_BOR;
+    case UPC_XOR:
+      return PTL_BXOR;
+    case UPC_LOGAND:
+      return PTL_LAND;
+    case UPC_LOGOR:
+      return PTL_LOR;
+    case UPC_MIN:
+      return PTL_MIN;
+    case UPC_MAX:
+      return PTL_MAX;
+    default:
+      gupcr_fatal_error ("cannot convert UPC reduce operation 0x%lx.", op);
+    }
+}
+
+
+
+/**
+ * Collectives reduce (C) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceC
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   signed char (*func) (signed char, signed char), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  signed char local_result = 0;
+  signed char *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed char %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.  */
+  full_rows = velems / (blk_size * THREADS);
+  last_row = velems % (blk_size * THREADS);
+  tail_thr = last_row / blk_size;
+
+  /* Calculate number of participating threads.  */
+  num_thr = (nelems + ph + blk_size - 1) / blk_size;
+  if (num_thr > THREADS)
+    num_thr = THREADS;
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (signed char *) ((shared const signed char *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	case UPC_NONCOMM_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      signed char *t_result =
+	(signed char *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(signed char *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed char),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_CHAR,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(signed char *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (signed char));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (signed char), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed char),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_CHAR, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(signed char *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (signed char));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (signed char),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_CHAR);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared signed char *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (UC) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [out] dst Destination shared pointer (the reduced result is written here)
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUC
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   unsigned char (*func) (unsigned char, unsigned char), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  unsigned char local_result = 0;
+  unsigned char *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned char %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  /* A block size of zero denotes an indefinite layout; treat the
+     source as a single block of 'nelems' elements.  Note that
+     blk_size remains zero when 'nelems' is also zero.  */
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.
+     Guard the divisions: blk_size is zero only when nelems is also
+     zero (see the adjustment above); in that case no thread holds
+     any elements and no thread participates in the reduce.  */
+  if (blk_size > 0)
+    {
+      full_rows = velems / (blk_size * THREADS);
+      last_row = velems % (blk_size * THREADS);
+      tail_thr = last_row / blk_size;
+
+      /* Calculate number of participating threads.  */
+      num_thr = (nelems + ph + blk_size - 1) / blk_size;
+      if (num_thr > THREADS)
+	num_thr = THREADS;
+    }
+  else
+    {
+      full_rows = 0;
+      last_row = 0;
+      tail_thr = 0;
+      num_thr = 0;
+    }
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (unsigned char *) ((shared const unsigned char *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	case UPC_NONCOMM_FUNC:
+	  /* Both cases apply 'func' pairwise, left to right, over the
+	     local elements.  */
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      unsigned char *t_result =
+	(unsigned char *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Triggered operations are intentionally disabled for reduce;
+   force the non-triggered code path below.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(unsigned char *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned char),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_UCHAR,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(unsigned char *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (unsigned char));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (unsigned char), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned char),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_UCHAR, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(unsigned char *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (unsigned char));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (unsigned char),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_UCHAR);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared unsigned char *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (S) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [out] dst Destination shared pointer (the reduced result is written here)
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceS
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   signed short (*func) (signed short, signed short), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  signed short local_result = 0;
+  signed short *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed short %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  /* A block size of zero denotes an indefinite layout; treat the
+     source as a single block of 'nelems' elements.  Note that
+     blk_size remains zero when 'nelems' is also zero.  */
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.
+     Guard the divisions: blk_size is zero only when nelems is also
+     zero (see the adjustment above); in that case no thread holds
+     any elements and no thread participates in the reduce.  */
+  if (blk_size > 0)
+    {
+      full_rows = velems / (blk_size * THREADS);
+      last_row = velems % (blk_size * THREADS);
+      tail_thr = last_row / blk_size;
+
+      /* Calculate number of participating threads.  */
+      num_thr = (nelems + ph + blk_size - 1) / blk_size;
+      if (num_thr > THREADS)
+	num_thr = THREADS;
+    }
+  else
+    {
+      full_rows = 0;
+      last_row = 0;
+      tail_thr = 0;
+      num_thr = 0;
+    }
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (signed short *) ((shared const signed short *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	case UPC_NONCOMM_FUNC:
+	  /* Both cases apply 'func' pairwise, left to right, over the
+	     local elements.  */
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      signed short *t_result =
+	(signed short *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Triggered operations are intentionally disabled for reduce;
+   force the non-triggered code path below.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(signed short *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed short),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_SHORT,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(signed short *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (signed short));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (signed short), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed short),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_SHORT, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(signed short *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (signed short));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (signed short),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_SHORT);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared signed short *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (US) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [out] dst Destination shared pointer (the reduced result is written here)
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUS
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   unsigned short (*func) (unsigned short, unsigned short), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  unsigned short local_result = 0;
+  unsigned short *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned short %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.  */
+  full_rows = velems / (blk_size * THREADS);
+  last_row = velems % (blk_size * THREADS);
+  tail_thr = last_row / blk_size;
+
+  /* Calculate number of participating threads.  */
+  num_thr = (nelems + ph + blk_size - 1) / blk_size;
+  if (num_thr > THREADS)
+    num_thr = THREADS;
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (unsigned short *) ((shared const unsigned short *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	case UPC_NONCOMM_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      unsigned short *t_result =
+	(unsigned short *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(unsigned short *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned short),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_USHORT,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(unsigned short *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (unsigned short));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (unsigned short), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned short),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_USHORT, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(unsigned short *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (unsigned short));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (unsigned short),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_USHORT);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared unsigned short *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (I) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [out] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceI
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   signed int (*func) (signed int, signed int), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  /* Result of reducing this thread's local elements; meaningful only
+     when n_local > 0.  */
+  signed int local_result = 0;
+  /* Local pointer used to walk this thread's slice of 'src'.  */
+  signed int *l_src;
+
+  /* Initialize the collectives library on first use.  */
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed int %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  /* A block size of 0 denotes the indefinite layout; treat 'src' as
+     a single block of 'nelems' elements on the source thread.  */
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.  */
+  full_rows = velems / (blk_size * THREADS);
+  last_row = velems % (blk_size * THREADS);
+  tail_thr = last_row / blk_size;
+
+  /* Calculate number of participating threads.  */
+  num_thr = (nelems + ph + blk_size - 1) / blk_size;
+  if (num_thr > THREADS)
+    num_thr = THREADS;
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      /* Note: the first 'else' binds to the inner 'if' (the
+         MYTHREAD == tail_thr test), the second to the outer one.  */
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (signed int *) ((shared const signed int *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	  /* The local pass combines elements in increasing index order,
+	     so the non-commutative case is handled identically.  */
+	case UPC_NONCOMM_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      signed int *t_result =
+	(signed int *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+/* NOTE(review): because of the #undef above, the following #if always
+   evaluates to 0, so the "NO TRIGGERED OPS" path below is the one
+   actually compiled; the triggered-ops code is retained for reference.  */
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(signed int *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed int),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_INT,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(signed int *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (signed int));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (signed int), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed int),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_INT, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(signed int *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (signed int));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (signed int),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_INT);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared signed int *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (UI) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [out] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUI
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   unsigned int (*func) (unsigned int, unsigned int), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  unsigned int local_result = 0;
+  unsigned int *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned int %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.  */
+  full_rows = velems / (blk_size * THREADS);
+  last_row = velems % (blk_size * THREADS);
+  tail_thr = last_row / blk_size;
+
+  /* Calculate number of participating threads.  */
+  num_thr = (nelems + ph + blk_size - 1) / blk_size;
+  if (num_thr > THREADS)
+    num_thr = THREADS;
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (unsigned int *) ((shared const unsigned int *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	case UPC_NONCOMM_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      unsigned int *t_result =
+	(unsigned int *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(unsigned int *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned int),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_UINT,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(unsigned int *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (unsigned int));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (unsigned int), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned int),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_UINT, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(unsigned int *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (unsigned int));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (unsigned int),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_UINT);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared unsigned int *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (L) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceL
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   signed long (*func) (signed long, signed long), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  signed long local_result = 0;
+  signed long *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER signed long %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  /* A block size of 0 is treated as "indefinite" layout; use the whole
+     element count as the block size.  */
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.  */
+  full_rows = velems / (blk_size * THREADS);
+  last_row = velems % (blk_size * THREADS);
+  tail_thr = last_row / blk_size;
+
+  /* Calculate number of participating threads.  */
+  num_thr = (nelems + ph + blk_size - 1) / blk_size;
+  if (num_thr > THREADS)
+    num_thr = THREADS;
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (signed long *) ((shared const signed long *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	case UPC_NONCOMM_FUNC:
+	  /* NOTE(review): locally identical to UPC_FUNC; operand order is
+	     preserved in this local pass, but the tree phase below combines
+	     child values in child-index order -- confirm this satisfies
+	     the ordering requirement for non-commutative functions.  */
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      signed long *t_result =
+	(signed long *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+/* NOTE(review): the #undef below leaves the macro undefined, so the
+   "#if" that follows always evaluates to 0 and only the
+   "NO TRIGGERED OPS" path is ever compiled.  The comment suggests this
+   is intentional; confirm before assuming the triggered-ops path runs.  */
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(signed long *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed long),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_LONG,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(signed long *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (signed long));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (signed long), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (signed long),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_LONG, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(signed long *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (signed long));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (signed long),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_LONG);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared signed long *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
+
+/**
+ * Collectives reduce (UL) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceUL
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   unsigned long (*func) (unsigned long, unsigned long), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  unsigned long local_result = 0;
+  unsigned long *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER unsigned long %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  /* A block size of 0 is treated as "indefinite" layout; use the whole
+     element count as the block size.  */
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.  */
+  full_rows = velems / (blk_size * THREADS);
+  last_row = velems % (blk_size * THREADS);
+  tail_thr = last_row / blk_size;
+
+  /* Calculate number of participating threads.  */
+  num_thr = (nelems + ph + blk_size - 1) / blk_size;
+  if (num_thr > THREADS)
+    num_thr = THREADS;
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (unsigned long *) ((shared const unsigned long *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	  /* Skip if not integral type, per spec 4.3.1.1
+	     (See additional comments in upc_collective.c) */
+	case UPC_AND:
+	  while (loop_cnt--)
+	    local_result &= *l_src++;
+	  break;
+	case UPC_OR:
+	  while (loop_cnt--)
+	    local_result |= *l_src++;
+	  break;
+	case UPC_XOR:
+	  while (loop_cnt--)
+	    local_result ^= *l_src++;
+	  break;
+	case UPC_LOGAND:
+	  while (loop_cnt--)
+	    local_result = local_result && *l_src++;
+	  break;
+	case UPC_LOGOR:
+	  while (loop_cnt--)
+	    local_result = local_result || *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	case UPC_NONCOMM_FUNC:
+	  /* NOTE(review): locally identical to UPC_FUNC; operand order is
+	     preserved in this local pass, but the tree phase below combines
+	     child values in child-index order -- confirm this satisfies
+	     the ordering requirement for non-commutative functions.  */
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      unsigned long *t_result =
+	(unsigned long *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+/* NOTE(review): the #undef below leaves the macro undefined, so the
+   "#if" that follows always evaluates to 0 and only the
+   "NO TRIGGERED OPS" path is ever compiled.  The comment suggests this
+   is intentional; confirm before assuming the triggered-ops path runs.  */
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(unsigned long *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned long),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_ULONG,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(unsigned long *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (unsigned long));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (unsigned long), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (unsigned long),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_ULONG, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(unsigned long *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (unsigned long));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (unsigned long),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_ULONG);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared unsigned long *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
+
/**
 * Collectives UPC_LOGAND function for float types
 *
 * Portals4 does not define logical AND atomic operations
 * and they will be executed as functions.
 */
float
gupcr_coll_logandF (float a, float b)
{
  /* Logical AND: 1.0f when both operands are non-zero, else 0.0f.  */
  return (a != 0.0f && b != 0.0f) ? 1.0f : 0.0f;
}
+
/**
 * Collectives UPC_LOGOR function for float types
 *
 * Portals4 does not define logical OR atomic operations
 * and they will be executed as functions.
 */

float
gupcr_coll_logorF (float a, float b)
{
  /* Logical OR: 1.0f when either operand is non-zero, else 0.0f.  */
  return (a != 0.0f || b != 0.0f) ? 1.0f : 0.0f;
}
+
/**
 * Collectives reduce (F) function
 *
 * The following steps are taken to calculate the reduced value:
 *
 * - Each thread reduces the values it has affinity to. Note that
 *   some of the threads might not participate in collectives reduce.
 * - A reduce tree is created out of the threads participating.
 * - All the parent threads signal their children that they are ready
 *   for the collectives reduce operation.
 * - All the children perform atomic portals reduce operations in the
 *   parent shared space. The reduced values are propagated to the
 *   top of the tree.
 * - Result is written to the specified destination.
 *
 * @param [in] dst Destination shared pointer
 * @param [in] src Source shared pointer
 * @param [in] op Collectives reduce operation
 * @param [in] nelems Number of elements
 * @param [in] blk_size Block size
 * @param [in] func Optional reduce function
 * @param [in] sync_mode Synchronization mode
 *
 */
void upc_all_reduceF
  (shared void *dst,
   shared const void *src,
   upc_op_t op,
   size_t nelems,
   size_t blk_size,
   float (*func) (float, float), upc_flag_t sync_mode)
{
  int i, n_local, full_rows, last_row;
  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;

  /* Running reduction over this thread's local elements; meaningful
     only when n_local > 0.  */
  float local_result = 0;
  float *l_src;

  if (!upc_coll_init_flag)
    upc_coll_init ();

  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER float %lu %lu",
	       (long unsigned) nelems, (long unsigned) blk_size);

  /* A block size of 0 is treated as "indefinite": all elements live
     on the source thread.  */
  if (blk_size == 0)
    blk_size = nelems;

#ifdef _UPC_COLL_CHECK_ARGS
  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
#endif

  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
    upc_barrier;

  /* Compute n_local, the number of elements local to this thread.  */
  n_local = 0;

  /* Also compute start, the starting index of src for each thread.  */

  src_thr = upc_threadof ((shared void *) src);
  dst_thr = upc_threadof ((shared void *) dst);
  ph = upc_phaseof ((shared void *) src);

  /* nelems plus the number of virtual elements in first row.  */
  velems = nelems + src_thr * blk_size + ph;

  /* Include virtual elements when computing number of local elements.  */
  full_rows = velems / (blk_size * THREADS);
  last_row = velems % (blk_size * THREADS);
  tail_thr = last_row / blk_size;

  /* Calculate number of participating threads.  */
  num_thr = (nelems + ph + blk_size - 1) / blk_size;
  if (num_thr > THREADS)
    num_thr = THREADS;

  gupcr_debug (FC_COLL,
	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
	       src_thr, tail_thr, ph, num_thr, full_rows);

  /* Calculate number of local elements.  */
  if (blk_size > 0)
    {
      /* Note: the "else" below binds to the inner "if"; "extras" is
         this thread's share of the final (partial) row.  */
      if (MYTHREAD <= tail_thr)
	if (MYTHREAD == tail_thr)
	  extras = last_row % blk_size;
	else
	  extras = blk_size;
      else
	extras = 0;

      n_local = blk_size * full_rows + extras;

      /* Adjust the number of elements in this thread, if necessary.  */
      if (MYTHREAD < src_thr)
	n_local -= blk_size;
      else if (MYTHREAD == src_thr)
	n_local -= ph;
    }
  else
    {
      n_local = 0;
      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
	n_local = nelems;
    }

  /* Starting index for this thread
     Note: start is sometimes negative because src is
     addressed here as if its block size is 1.  */

  if (blk_size > 0)
    if (MYTHREAD > src_thr)
      start = MYTHREAD - src_thr - ph * THREADS;
    else if (MYTHREAD < src_thr)
      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
    else			/* This is the source thread.  */
      start = 0;
  else
    start = 0;

  /* Logical operations on floating point types must execute as
     functions as Portals4 does not have support for them.  */
  switch (op)
    {
    case UPC_LOGAND:
      func = &gupcr_coll_logandF;
      op = UPC_FUNC;
      break;
    case UPC_LOGOR:
      func = &gupcr_coll_logorF;
      op = UPC_FUNC;
      break;
    }

  /* Reduce the elements local to this thread.  */

  if (n_local > 0)
    {
      int loop_cnt = n_local - 1;

      /* Cast the shared address down to a local pointer; assumes the
         addressed elements have affinity to this thread -- TODO confirm.  */
      l_src = (float *) ((shared const float *) src + start);
      local_result = *l_src++;

      switch (op)
	{
	case UPC_ADD:
	  while (loop_cnt--)
	    local_result += *l_src++;
	  break;
	case UPC_MULT:
	  while (loop_cnt--)
	    local_result *= *l_src++;
	  break;
	case UPC_MIN:
	  while (loop_cnt--)
	    {
	      if (local_result > *l_src)
		local_result = *l_src;
	      ++l_src;
	    }
	  break;
	case UPC_MAX:
	  while (loop_cnt--)
	    {
	      if (local_result < *l_src)
		local_result = *l_src;
	      ++l_src;
	    }
	  break;
	case UPC_FUNC:
	  while (loop_cnt--)
	    local_result = func (local_result, *l_src++);
	  break;
	case UPC_NONCOMM_FUNC:
	  while (loop_cnt--)
	    local_result = func (local_result, *l_src++);
	  break;
	default:
	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
	}
    }

  /* Note: local_result is undefined if n_local == 0.
     Note: Only a proper subset of threads have a meaningful local_result.
     Note: dst might be a thread that does not have a local result.  */

  /* Global reduce on only participating threads.  */
  if (n_local)
    {
      /* Local pointer where reduced values are written too.
         NOTE(review): presumes each value[] slot is at least
         sizeof (float) wide -- confirm against the declaration of
         gupcr_reduce_storage.  */
      float *t_result =
	(float *) & gupcr_reduce_storage[MYTHREAD].value[0];

      /* Initialize collectives reduce tree.  */
      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);

      /* Copy in local results into the area for reduce operation.
         NOTE: Not needed for the case of collective functions. However,
         this covers the case of only one thread.  */
      *t_result = local_result;

#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
/* Run reduce operation without triggered functions.  */
/* NOTE(review): this #undef unconditionally disables the triggered-ops
   path below, so only the "NO TRIGGERED OPS" branch can ever run --
   confirm whether this is a temporary workaround.  */
#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
#endif
#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
         to use triggered operations on inner nodes. In that case, inner
         nodes must calculate reduced value by calling the specified
         function.  */
      if (gupcr_coll_child_cnt)
	{
	  if (IS_ROOT_THREAD)
	    {
	      /* ROOT THREAD */
	      /* Let children know that parent is ready.  */
	      for (i = 0; i < gupcr_coll_child_cnt; i++)
		{
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].signal));
		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
		}
	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);

	      /* Wait for children to report their values.  */
	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);

	      /* Reduce local values with those of children if necessary.  */
	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
		{
		  /* Reduce local result with those of children.  */
		  for (i = 0; i < gupcr_coll_child_cnt; i++)
		    {
		      local_result =
			func (local_result, *(float *)
			      & gupcr_reduce_storage[MYTHREAD].value[i]);
		    }
		  *t_result = local_result;
		}
	    }
	  else
	    {
	      /* INNER THREAD */
	      /* Prepare triggered atomic function.  */
	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
		{
		  /* Use triggered atomic operations once children sent
		     their results and parent is ready to receive it.  */
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].value[0]));
		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
					     offset, sizeof (float),
					     gupcr_portals_reduce_op (op),
					     UPC_COLL_TO_PTL_FLOAT,
					     gupcr_coll_child_cnt + 1);
		}
	      /* Let children know that parent is ready.  */
	      for (i = 0; i < gupcr_coll_child_cnt; i++)
		{
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].signal));
		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
		}
	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);

	      /* Wait for completion, children and parent are ready.  */
	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
	      /* Execute reduce functions if necessary.  */
	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
		{
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].value[0]));
		  size_t doffset =
		    upc_addrfield ((shared void *)
				   &(gupcr_reduce_storage[MYTHREAD].value
				     [gupcr_coll_child_index]));
		  /* Reduce local result with those of children.  */
		  for (i = 0; i < gupcr_coll_child_cnt; i++)
		    {
		      local_result = func (local_result, *(float *)
					   &
					   gupcr_reduce_storage
					   [MYTHREAD].value[i]);
		    }
		  *t_result = local_result;
		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
				  sizeof (float));
		}
	      /* Wait for our value to go up the tree.  */
	      gupcr_coll_ack_wait (1);
	    }
	}
      else
	{
	  /* Avoid the case where only one thread is available.  */
	  if (!IS_ROOT_THREAD)
	    {
	      /* LEAF THREAD */
	      size_t offset = upc_addrfield ((shared void *)
					     &(gupcr_reduce_storage
					       [MYTHREAD].value[0]));
	      switch (op)
		{
		case UPC_FUNC:
		case UPC_NONCOMM_FUNC:
		  {
		    /* Schedule a triggered put once signal is received.  */
		    size_t doffset = upc_addrfield ((shared void *)
						    &(gupcr_reduce_storage
						      [MYTHREAD].
						      value
						      [gupcr_coll_child_index]));
		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
					offset, sizeof (float), 1);
		  }
		  break;
		default:
		  /* Schedule a triggered atomic put once parent is ready.  */
		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
					     offset, sizeof (float),
					     gupcr_portals_reduce_op (op),
					     UPC_COLL_TO_PTL_FLOAT, 1);
		  break;
		}
	      /* Wait for parent to be ready.  */
	      gupcr_coll_signal_wait (1);
	      /* Wait for our value to leave.  */
	      gupcr_coll_ack_wait (1);
	    }
	}
#else /* NO TRIGGERED OPS */
      /* Send signal to all children.  */
      if (gupcr_coll_child_cnt)
	{
	  /* ROOT OR INNER THREAD */
	  int wait_cnt = gupcr_coll_child_cnt;

	  /* Signal that parent is ready to receive the locally reduced
	     values from its children. Value that we send does not matter.  */
	  for (i = 0; i < gupcr_coll_child_cnt; i++)
	    {
	      size_t offset = upc_addrfield ((shared void *)
					     &(gupcr_reduce_storage
					       [MYTHREAD].signal));
	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
	    }
	  gupcr_coll_ack_wait (wait_cnt);

	  /* Wait for children to report their local reduced values and
	     parent to report it is ready to receive the reduced value.  */
	  if (!IS_ROOT_THREAD)
	    ++wait_cnt;
	  gupcr_coll_signal_wait (wait_cnt);

	  /* Compute result if reduce functions are used.  */
	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
	    {
	      for (i = 0; i < gupcr_coll_child_cnt; i++)
		{
		  local_result = func (local_result,
				       *(float *) &
				       gupcr_reduce_storage[MYTHREAD].value
				       [i]);
		}
	      /* Prepare reduced value for going up the tree.  */
	      *t_result = local_result;
	    }
	}
      else if (!IS_ROOT_THREAD)
	{
	  /* LEAF THREAD */
	  gupcr_coll_signal_wait (1);
	}

      /* Send reduced value to the parent.  */
      if (!IS_ROOT_THREAD)
	{
	  /* LEAF OR INNER THREAD */
	  /* Each child places its result into the parent memory slot
	     dedicated for the child. The parent is responsible
	     for creating the reduced result for itself and its
	     children.  */
	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
	    {
	      size_t doffset = upc_addrfield ((shared void *)
					      &(gupcr_reduce_storage
						[MYTHREAD].value
						[gupcr_coll_child_index]));
	      size_t soffset =
		upc_addrfield ((shared void *)
			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
			      sizeof (float));
	    }
	  else
	    {
	      size_t offset = upc_addrfield ((shared void *)
					     &(gupcr_reduce_storage
					       [MYTHREAD].value[0]));
	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
				     sizeof (float),
				     gupcr_portals_reduce_op (op),
				     UPC_COLL_TO_PTL_FLOAT);
	    }
	  gupcr_coll_ack_wait (1);
	}
#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */

      /* Copy result into the caller's specified destination.
         Only the root of the reduce tree holds the final value.  */
      if (IS_ROOT_THREAD)
	{
	  *(shared float *) dst = *t_result;
	}
    }

  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
    upc_barrier;

  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
}
+
/**
 * Collectives UPC_LOGAND function for double types
 *
 * Portals4 does not define logical AND atomic operations
 * and they will be executed as functions.
 *
 * @param [in] a First operand
 * @param [in] b Second operand
 * @retval 1.0 if both operands are non-zero, 0.0 otherwise
 */
double
gupcr_coll_logandD (double a, double b)
{
  return a && b;
}
+
/**
 * Collectives UPC_LOGOR function for double types
 *
 * Portals4 does not define logical OR atomic operations
 * and they will be executed as functions.
 *
 * @param [in] a First operand
 * @param [in] b Second operand
 * @retval 1.0 if either operand is non-zero, 0.0 otherwise
 */
double
gupcr_coll_logorD (double a, double b)
{
  return a || b;
}
+
/**
 * Collectives reduce (D) function
 *
 * The following steps are taken to calculate the reduced value:
 *
 * - Each thread reduces the values it has affinity to. Note that
 *   some of the threads might not participate in collectives reduce.
 * - A reduce tree is created out of the threads participating.
 * - All the parent threads signal their children that they are ready
 *   for the collectives reduce operation.
 * - All the children perform atomic portals reduce operations in the
 *   parent shared space. The reduced values are propagated to the
 *   top of the tree.
 * - Result is written to the specified destination.
 *
 * @param [in] dst Destination shared pointer
 * @param [in] src Source shared pointer
 * @param [in] op Collectives reduce operation
 * @param [in] nelems Number of elements
 * @param [in] blk_size Block size
 * @param [in] func Optional reduce function
 * @param [in] sync_mode Synchronization mode
 *
 */
void upc_all_reduceD
  (shared void *dst,
   shared const void *src,
   upc_op_t op,
   size_t nelems,
   size_t blk_size,
   double (*func) (double, double), upc_flag_t sync_mode)
{
  int i, n_local, full_rows, last_row;
  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;

  /* Running reduction over this thread's local elements; meaningful
     only when n_local > 0.  */
  double local_result = 0;
  double *l_src;

  if (!upc_coll_init_flag)
    upc_coll_init ();

  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER double %lu %lu",
	       (long unsigned) nelems, (long unsigned) blk_size);

  /* A block size of 0 is treated as "indefinite": all elements live
     on the source thread.  */
  if (blk_size == 0)
    blk_size = nelems;

#ifdef _UPC_COLL_CHECK_ARGS
  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
#endif

  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
    upc_barrier;

  /* Compute n_local, the number of elements local to this thread.  */
  n_local = 0;

  /* Also compute start, the starting index of src for each thread.  */

  src_thr = upc_threadof ((shared void *) src);
  dst_thr = upc_threadof ((shared void *) dst);
  ph = upc_phaseof ((shared void *) src);

  /* nelems plus the number of virtual elements in first row.  */
  velems = nelems + src_thr * blk_size + ph;

  /* Include virtual elements when computing number of local elements.  */
  full_rows = velems / (blk_size * THREADS);
  last_row = velems % (blk_size * THREADS);
  tail_thr = last_row / blk_size;

  /* Calculate number of participating threads.  */
  num_thr = (nelems + ph + blk_size - 1) / blk_size;
  if (num_thr > THREADS)
    num_thr = THREADS;

  gupcr_debug (FC_COLL,
	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
	       src_thr, tail_thr, ph, num_thr, full_rows);

  /* Calculate number of local elements.  */
  if (blk_size > 0)
    {
      /* Note: the "else" below binds to the inner "if"; "extras" is
         this thread's share of the final (partial) row.  */
      if (MYTHREAD <= tail_thr)
	if (MYTHREAD == tail_thr)
	  extras = last_row % blk_size;
	else
	  extras = blk_size;
      else
	extras = 0;

      n_local = blk_size * full_rows + extras;

      /* Adjust the number of elements in this thread, if necessary.  */
      if (MYTHREAD < src_thr)
	n_local -= blk_size;
      else if (MYTHREAD == src_thr)
	n_local -= ph;
    }
  else
    {
      n_local = 0;
      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
	n_local = nelems;
    }

  /* Starting index for this thread
     Note: start is sometimes negative because src is
     addressed here as if its block size is 1.  */

  if (blk_size > 0)
    if (MYTHREAD > src_thr)
      start = MYTHREAD - src_thr - ph * THREADS;
    else if (MYTHREAD < src_thr)
      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
    else			/* This is the source thread.  */
      start = 0;
  else
    start = 0;

  /* Logical operations on floating point types must execute as
     functions as Portals4 does not have support for them.  */
  switch (op)
    {
    case UPC_LOGAND:
      func = &gupcr_coll_logandD;
      op = UPC_FUNC;
      break;
    case UPC_LOGOR:
      func = &gupcr_coll_logorD;
      op = UPC_FUNC;
      break;
    }

  /* Reduce the elements local to this thread.  */

  if (n_local > 0)
    {
      int loop_cnt = n_local - 1;

      /* Cast the shared address down to a local pointer; assumes the
         addressed elements have affinity to this thread -- TODO confirm.  */
      l_src = (double *) ((shared const double *) src + start);
      local_result = *l_src++;

      switch (op)
	{
	case UPC_ADD:
	  while (loop_cnt--)
	    local_result += *l_src++;
	  break;
	case UPC_MULT:
	  while (loop_cnt--)
	    local_result *= *l_src++;
	  break;
	case UPC_MIN:
	  while (loop_cnt--)
	    {
	      if (local_result > *l_src)
		local_result = *l_src;
	      ++l_src;
	    }
	  break;
	case UPC_MAX:
	  while (loop_cnt--)
	    {
	      if (local_result < *l_src)
		local_result = *l_src;
	      ++l_src;
	    }
	  break;
	case UPC_FUNC:
	  while (loop_cnt--)
	    local_result = func (local_result, *l_src++);
	  break;
	case UPC_NONCOMM_FUNC:
	  while (loop_cnt--)
	    local_result = func (local_result, *l_src++);
	  break;
	default:
	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
	}
    }

  /* Note: local_result is undefined if n_local == 0.
     Note: Only a proper subset of threads have a meaningful local_result.
     Note: dst might be a thread that does not have a local result.  */

  /* Global reduce on only participating threads.  */
  if (n_local)
    {
      /* Local pointer where reduced values are written too.
         NOTE(review): presumes each value[] slot is at least
         sizeof (double) wide -- confirm against the declaration of
         gupcr_reduce_storage.  */
      double *t_result =
	(double *) & gupcr_reduce_storage[MYTHREAD].value[0];

      /* Initialize collectives reduce tree.  */
      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);

      /* Copy in local results into the area for reduce operation.
         NOTE: Not needed for the case of collective functions. However,
         this covers the case of only one thread.  */
      *t_result = local_result;

#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
/* Run reduce operation without triggered functions.  */
/* NOTE(review): this #undef unconditionally disables the triggered-ops
   path below, so only the "NO TRIGGERED OPS" branch can ever run --
   confirm whether this is a temporary workaround.  */
#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
#endif
#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
         to use triggered operations on inner nodes. In that case, inner
         nodes must calculate reduced value by calling the specified
         function.  */
      if (gupcr_coll_child_cnt)
	{
	  if (IS_ROOT_THREAD)
	    {
	      /* ROOT THREAD */
	      /* Let children know that parent is ready.  */
	      for (i = 0; i < gupcr_coll_child_cnt; i++)
		{
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].signal));
		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
		}
	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);

	      /* Wait for children to report their values.  */
	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);

	      /* Reduce local values with those of children if necessary.  */
	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
		{
		  /* Reduce local result with those of children.  */
		  for (i = 0; i < gupcr_coll_child_cnt; i++)
		    {
		      local_result =
			func (local_result, *(double *)
			      & gupcr_reduce_storage[MYTHREAD].value[i]);
		    }
		  *t_result = local_result;
		}
	    }
	  else
	    {
	      /* INNER THREAD */
	      /* Prepare triggered atomic function.  */
	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
		{
		  /* Use triggered atomic operations once children sent
		     their results and parent is ready to receive it.  */
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].value[0]));
		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
					     offset, sizeof (double),
					     gupcr_portals_reduce_op (op),
					     UPC_COLL_TO_PTL_DOUBLE,
					     gupcr_coll_child_cnt + 1);
		}
	      /* Let children know that parent is ready.  */
	      for (i = 0; i < gupcr_coll_child_cnt; i++)
		{
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].signal));
		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
		}
	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);

	      /* Wait for completion, children and parent are ready.  */
	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
	      /* Execute reduce functions if necessary.  */
	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
		{
		  size_t offset = upc_addrfield ((shared void *)
						 &(gupcr_reduce_storage
						   [MYTHREAD].value[0]));
		  size_t doffset =
		    upc_addrfield ((shared void *)
				   &(gupcr_reduce_storage[MYTHREAD].value
				     [gupcr_coll_child_index]));
		  /* Reduce local result with those of children.  */
		  for (i = 0; i < gupcr_coll_child_cnt; i++)
		    {
		      local_result = func (local_result, *(double *)
					   &
					   gupcr_reduce_storage
					   [MYTHREAD].value[i]);
		    }
		  *t_result = local_result;
		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
				  sizeof (double));
		}
	      /* Wait for our value to go up the tree.  */
	      gupcr_coll_ack_wait (1);
	    }
	}
      else
	{
	  /* Avoid the case where only one thread is available.  */
	  if (!IS_ROOT_THREAD)
	    {
	      /* LEAF THREAD */
	      size_t offset = upc_addrfield ((shared void *)
					     &(gupcr_reduce_storage
					       [MYTHREAD].value[0]));
	      switch (op)
		{
		case UPC_FUNC:
		case UPC_NONCOMM_FUNC:
		  {
		    /* Schedule a triggered put once signal is received.  */
		    size_t doffset = upc_addrfield ((shared void *)
						    &(gupcr_reduce_storage
						      [MYTHREAD].
						      value
						      [gupcr_coll_child_index]));
		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
					offset, sizeof (double), 1);
		  }
		  break;
		default:
		  /* Schedule a triggered atomic put once parent is ready.  */
		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
					     offset, sizeof (double),
					     gupcr_portals_reduce_op (op),
					     UPC_COLL_TO_PTL_DOUBLE, 1);
		  break;
		}
	      /* Wait for parent to be ready.  */
	      gupcr_coll_signal_wait (1);
	      /* Wait for our value to leave.  */
	      gupcr_coll_ack_wait (1);
	    }
	}
#else /* NO TRIGGERED OPS */
      /* Send signal to all children.  */
      if (gupcr_coll_child_cnt)
	{
	  /* ROOT OR INNER THREAD */
	  int wait_cnt = gupcr_coll_child_cnt;

	  /* Signal that parent is ready to receive the locally reduced
	     values from its children. Value that we send does not matter.  */
	  for (i = 0; i < gupcr_coll_child_cnt; i++)
	    {
	      size_t offset = upc_addrfield ((shared void *)
					     &(gupcr_reduce_storage
					       [MYTHREAD].signal));
	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
	    }
	  gupcr_coll_ack_wait (wait_cnt);

	  /* Wait for children to report their local reduced values and
	     parent to report it is ready to receive the reduced value.  */
	  if (!IS_ROOT_THREAD)
	    ++wait_cnt;
	  gupcr_coll_signal_wait (wait_cnt);

	  /* Compute result if reduce functions are used.  */
	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
	    {
	      for (i = 0; i < gupcr_coll_child_cnt; i++)
		{
		  local_result = func (local_result,
				       *(double *) &
				       gupcr_reduce_storage[MYTHREAD].value
				       [i]);
		}
	      /* Prepare reduced value for going up the tree.  */
	      *t_result = local_result;
	    }
	}
      else if (!IS_ROOT_THREAD)
	{
	  /* LEAF THREAD */
	  gupcr_coll_signal_wait (1);
	}

      /* Send reduced value to the parent.  */
      if (!IS_ROOT_THREAD)
	{
	  /* LEAF OR INNER THREAD */
	  /* Each child places its result into the parent memory slot
	     dedicated for the child. The parent is responsible
	     for creating the reduced result for itself and its
	     children.  */
	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
	    {
	      size_t doffset = upc_addrfield ((shared void *)
					      &(gupcr_reduce_storage
						[MYTHREAD].value
						[gupcr_coll_child_index]));
	      size_t soffset =
		upc_addrfield ((shared void *)
			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
			      sizeof (double));
	    }
	  else
	    {
	      size_t offset = upc_addrfield ((shared void *)
					     &(gupcr_reduce_storage
					       [MYTHREAD].value[0]));
	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
				     sizeof (double),
				     gupcr_portals_reduce_op (op),
				     UPC_COLL_TO_PTL_DOUBLE);
	    }
	  gupcr_coll_ack_wait (1);
	}
#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */

      /* Copy result into the caller's specified destination.
         Only the root of the reduce tree holds the final value.  */
      if (IS_ROOT_THREAD)
	{
	  *(shared double *) dst = *t_result;
	}
    }

  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
    upc_barrier;

  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
}
+
/**
 * Collectives UPC_LOGAND function for long double types
 *
 * Portals4 does not define logical AND atomic operations
 * and they will be executed as functions.
 *
 * @param [in] a First operand
 * @param [in] b Second operand
 * @retval 1.0 if both operands are non-zero, 0.0 otherwise
 */
long double
gupcr_coll_logandLD (long double a, long double b)
{
  return a && b;
}
+
/**
 * Collectives UPC_LOGOR function for long double types
 *
 * Portals4 does not define logical OR atomic operations
 * and they will be executed as functions.
 *
 * @param [in] a First operand
 * @param [in] b Second operand
 * @retval 1.0 if either operand is non-zero, 0.0 otherwise
 */
long double
gupcr_coll_logorLD (long double a, long double b)
{
  return a || b;
}
+
+/**
+ * Collectives reduce (LD) function
+ *
+ * The following steps are taken to calculate the reduced value:
+ *
+ * - Each thread reduces the values it has affinity to. Note that
+ *   some of the threads might not participate in collectives reduce.
+ * - A reduce tree is created out of the threads participating.
+ * - All the parent threads signal their children that they are ready
+ *   for the collectives reduce operation.
+ * - All the children perform atomic portals reduce operations in the
+ *   parent shared space. The reduced values are propagated to the
+ *   top of the tree.
+ * - Result is written to the specified destination.
+ *
+ * @param [in] dst Destination shared pointer
+ * @param [in] src Source shared pointer
+ * @param [in] op Collectives reduce operation
+ * @param [in] nelems Number of elements
+ * @param [in] blk_size Block size
+ * @param [in] func Optional reduce function
+ * @param [in] sync_mode Synchronization mode
+ *
+ */
+void upc_all_reduceLD
+  (shared void *dst,
+   shared const void *src,
+   upc_op_t op,
+   size_t nelems,
+   size_t blk_size,
+   long double (*func) (long double, long double), upc_flag_t sync_mode)
+{
+  int i, n_local, full_rows, last_row;
+  int num_thr, tail_thr, extras, ph, src_thr, dst_thr, velems, start;
+
+  long double local_result = 0;
+  long double *l_src;
+
+  if (!upc_coll_init_flag)
+    upc_coll_init ();
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE ENTER long double %lu %lu",
+	       (long unsigned) nelems, (long unsigned) blk_size);
+
+  if (blk_size == 0)
+    blk_size = nelems;
+
+#ifdef _UPC_COLL_CHECK_ARGS
+  upc_coll_err (dst, src, NULL, 0, sync_mode, blk_size, nelems, op, UPC_RED);
+#endif
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_IN_MYSYNC & sync_mode || !(UPC_IN_NOSYNC & sync_mode))
+    upc_barrier;
+
+  /* Compute n_local, the number of elements local to this thread.  */
+  n_local = 0;
+
+  /* Also compute start, the starting index of src for each thread.  */
+
+  src_thr = upc_threadof ((shared void *) src);
+  dst_thr = upc_threadof ((shared void *) dst);
+  ph = upc_phaseof ((shared void *) src);
+
+  /* nelems plus the number of virtual elements in first row.  */
+  velems = nelems + src_thr * blk_size + ph;
+
+  /* Include virtual elements when computing number of local elements.  */
+  full_rows = velems / (blk_size * THREADS);
+  last_row = velems % (blk_size * THREADS);
+  tail_thr = last_row / blk_size;
+
+  /* Calculate number of participating threads.  */
+  num_thr = (nelems + ph + blk_size - 1) / blk_size;
+  if (num_thr > THREADS)
+    num_thr = THREADS;
+
+  gupcr_debug (FC_COLL,
+	       "src_thr: %d tail_thr: %d ph: %d num_thr: %d full_rows: %d",
+	       src_thr, tail_thr, ph, num_thr, full_rows);
+
+  /* Calculate number of local elements.  */
+  if (blk_size > 0)
+    {
+      if (MYTHREAD <= tail_thr)
+	if (MYTHREAD == tail_thr)
+	  extras = last_row % blk_size;
+	else
+	  extras = blk_size;
+      else
+	extras = 0;
+
+      n_local = blk_size * full_rows + extras;
+
+      /* Adjust the number of elements in this thread, if necessary.  */
+      if (MYTHREAD < src_thr)
+	n_local -= blk_size;
+      else if (MYTHREAD == src_thr)
+	n_local -= ph;
+    }
+  else
+    {
+      n_local = 0;
+      if (src_thr == MYTHREAD)	/* Revise the number of local elements.  */
+	n_local = nelems;
+    }
+
+  /* Starting index for this thread
+     Note: start is sometimes negative because src is
+     addressed here as if its block size is 1.  */
+
+  if (blk_size > 0)
+    if (MYTHREAD > src_thr)
+      start = MYTHREAD - src_thr - ph * THREADS;
+    else if (MYTHREAD < src_thr)
+      start = (blk_size - ph) * THREADS + MYTHREAD - src_thr;
+    else			/* This is the source thread.  */
+      start = 0;
+  else
+    start = 0;
+
+  /* Logical operations on floating point types must execute as
+     functions as Portals4 does not have support for them.  */
+  switch (op)
+    {
+    case UPC_LOGAND:
+      func = &gupcr_coll_logandLD;
+      op = UPC_FUNC;
+      break;
+    case UPC_LOGOR:
+      func = &gupcr_coll_logorLD;
+      op = UPC_FUNC;
+      break;
+    }
+
+  /* Reduce the elements local to this thread.  */
+
+  if (n_local > 0)
+    {
+      int loop_cnt = n_local - 1;
+
+      l_src = (long double *) ((shared const long double *) src + start);
+      local_result = *l_src++;
+
+      switch (op)
+	{
+	case UPC_ADD:
+	  while (loop_cnt--)
+	    local_result += *l_src++;
+	  break;
+	case UPC_MULT:
+	  while (loop_cnt--)
+	    local_result *= *l_src++;
+	  break;
+	case UPC_MIN:
+	  while (loop_cnt--)
+	    {
+	      if (local_result > *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_MAX:
+	  while (loop_cnt--)
+	    {
+	      if (local_result < *l_src)
+		local_result = *l_src;
+	      ++l_src;
+	    }
+	  break;
+	case UPC_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	case UPC_NONCOMM_FUNC:
+	  while (loop_cnt--)
+	    local_result = func (local_result, *l_src++);
+	  break;
+	default:
+	  gupcr_fatal_error ("bad UPC collectives reduce operator 0x%lx", op);
+	}
+    }
+
+  /* Note: local_result is undefined if n_local == 0.
+     Note: Only a proper subset of threads have a meaningful local_result.
+     Note: dst might be a thread that does not have a local result.  */
+
+  /* Global reduce on only participating threads.  */
+  if (n_local)
+    {
+      /* Local pointer where reduced values are written to.  */
+      long double *t_result =
+	(long double *) & gupcr_reduce_storage[MYTHREAD].value[0];
+
+      /* Initialize collectives reduce tree.  */
+      gupcr_coll_tree_setup (dst_thr, src_thr, num_thr);
+
+      /* Copy in local results into the area for reduce operation.
+         NOTE: Not needed for the case of collective functions. However,
+         this covers the case of only one thread.  */
+      *t_result = local_result;
+
+#ifdef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+/* Run reduce operation without triggered functions.  */
+#undef GUPCR_USE_PORTALS4_TRIGGERED_OPS
+#endif
+#if GUPCR_USE_PORTALS4_TRIGGERED_OPS
+      /* Note: In the case of UPC_FUNC and UPC_NONCOMM, it is not possible
+         to use triggered operations on inner nodes. In that case, inner
+         nodes must calculate reduced value by calling the specified
+         function.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  if (IS_ROOT_THREAD)
+	    {
+	      /* ROOT THREAD */
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for children to report their values.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt);
+
+	      /* Reduce local values with those of children if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result =
+			func (local_result, *(long double *)
+			      & gupcr_reduce_storage[MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		}
+	    }
+	  else
+	    {
+	      /* INNER THREAD */
+	      /* Prepare triggered atomic function.  */
+	      if ((op != UPC_FUNC) && (op != UPC_NONCOMM_FUNC))
+		{
+		  /* Use triggered atomic operations once children sent
+		     their results and parent is ready to receive it.  */
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (long double),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_LONG_DOUBLE,
+					     gupcr_coll_child_cnt + 1);
+		}
+	      /* Let children know that parent is ready.  */
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].signal));
+		  gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+		}
+	      gupcr_coll_ack_wait (gupcr_coll_child_cnt);
+
+	      /* Wait for completion, children and parent are ready.  */
+	      gupcr_coll_signal_wait (gupcr_coll_child_cnt + 1);
+	      /* Execute reduce functions if necessary.  */
+	      if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+		{
+		  size_t offset = upc_addrfield ((shared void *)
+						 &(gupcr_reduce_storage
+						   [MYTHREAD].value[0]));
+		  size_t doffset =
+		    upc_addrfield ((shared void *)
+				   &(gupcr_reduce_storage[MYTHREAD].value
+				     [gupcr_coll_child_index]));
+		  /* Reduce local result with those of children.  */
+		  for (i = 0; i < gupcr_coll_child_cnt; i++)
+		    {
+		      local_result = func (local_result, *(long double *)
+					   &
+					   gupcr_reduce_storage
+					   [MYTHREAD].value[i]);
+		    }
+		  *t_result = local_result;
+		  gupcr_coll_put (gupcr_coll_parent_thread, doffset, offset,
+				  sizeof (long double));
+		}
+	      /* Wait for our value to go up the tree.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+      else
+	{
+	  /* Avoid the case where only one thread is available.  */
+	  if (!IS_ROOT_THREAD)
+	    {
+	      /* LEAF THREAD */
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      switch (op)
+		{
+		case UPC_FUNC:
+		case UPC_NONCOMM_FUNC:
+		  {
+		    /* Schedule a triggered put once signal is received.  */
+		    size_t doffset = upc_addrfield ((shared void *)
+						    &(gupcr_reduce_storage
+						      [MYTHREAD].
+						      value
+						      [gupcr_coll_child_index]));
+		    gupcr_coll_trigput (gupcr_coll_parent_thread, doffset,
+					offset, sizeof (long double), 1);
+		  }
+		  break;
+		default:
+		  /* Schedule a triggered atomic put once parent is ready.  */
+		  gupcr_coll_trigput_atomic (gupcr_coll_parent_thread, offset,
+					     offset, sizeof (long double),
+					     gupcr_portals_reduce_op (op),
+					     UPC_COLL_TO_PTL_LONG_DOUBLE, 1);
+		  break;
+		}
+	      /* Wait for parent to be ready.  */
+	      gupcr_coll_signal_wait (1);
+	      /* Wait for our value to leave.  */
+	      gupcr_coll_ack_wait (1);
+	    }
+	}
+#else /* NO TRIGGERED OPS */
+      /* Send signal to all children.  */
+      if (gupcr_coll_child_cnt)
+	{
+	  /* ROOT OR INNER THREAD */
+	  int wait_cnt = gupcr_coll_child_cnt;
+
+	  /* Signal that parent is ready to receive the locally reduced
+	     values from its children. Value that we send does not matter.  */
+	  for (i = 0; i < gupcr_coll_child_cnt; i++)
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].signal));
+	      gupcr_coll_put (gupcr_coll_child[i], offset, offset, 1);
+	    }
+	  gupcr_coll_ack_wait (wait_cnt);
+
+	  /* Wait for children to report their local reduced values and
+	     parent to report it is ready to receive the reduced value.  */
+	  if (!IS_ROOT_THREAD)
+	    ++wait_cnt;
+	  gupcr_coll_signal_wait (wait_cnt);
+
+	  /* Compute result if reduce functions are used.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      for (i = 0; i < gupcr_coll_child_cnt; i++)
+		{
+		  local_result = func (local_result,
+				       *(long double *) &
+				       gupcr_reduce_storage[MYTHREAD].value
+				       [i]);
+		}
+	      /* Prepare reduced value for going up the tree.  */
+	      *t_result = local_result;
+	    }
+	}
+      else if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF THREAD */
+	  gupcr_coll_signal_wait (1);
+	}
+
+      /* Send reduced value to the parent.  */
+      if (!IS_ROOT_THREAD)
+	{
+	  /* LEAF OR INNER THREAD */
+	  /* Each child places its result into the parent memory slot
+	     dedicated for the child. The parent is responsible
+	     for creating the reduced result for itself and its
+	     children.  */
+	  if ((op == UPC_FUNC) || (op == UPC_NONCOMM_FUNC))
+	    {
+	      size_t doffset = upc_addrfield ((shared void *)
+					      &(gupcr_reduce_storage
+						[MYTHREAD].value
+						[gupcr_coll_child_index]));
+	      size_t soffset =
+		upc_addrfield ((shared void *)
+			       &(gupcr_reduce_storage[MYTHREAD].value[0]));
+	      gupcr_coll_put (gupcr_coll_parent_thread, doffset, soffset,
+			      sizeof (long double));
+	    }
+	  else
+	    {
+	      size_t offset = upc_addrfield ((shared void *)
+					     &(gupcr_reduce_storage
+					       [MYTHREAD].value[0]));
+	      gupcr_coll_put_atomic (gupcr_coll_parent_thread, offset, offset,
+				     sizeof (long double),
+				     gupcr_portals_reduce_op (op),
+				     UPC_COLL_TO_PTL_LONG_DOUBLE);
+	    }
+	  gupcr_coll_ack_wait (1);
+	}
+#endif /* GUPCR_USE_PORTALS4_TRIGGERED_OPS */
+
+      /* Copy result into the caller's specified destination.  */
+      if (IS_ROOT_THREAD)
+	{
+	  *(shared long double *) dst = *t_result;
+	}
+    }
+
+  /* Synchronize using barriers in the cases of MYSYNC and ALLSYNC.  */
+  if (UPC_OUT_MYSYNC & sync_mode || !(UPC_OUT_NOSYNC & sync_mode))
+    upc_barrier;
+
+  gupcr_trace (FC_COLL, "COLL ALL_REDUCE EXIT");
+}
Index: libgupc/portals4/gupcr_coll_sup.c
===================================================================
--- libgupc/portals4/gupcr_coll_sup.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_coll_sup.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,393 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_lib.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_coll_sup.h"
+
+/**
+ * @file gupcr_coll_sup.c
+ * GUPC Portals4 collectives implementation support routines.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/** Collectives shared access LE handle */
+static ptl_handle_le_t gupcr_coll_le;
+/** Collectives shared access LE counting events handle */
+static ptl_handle_ct_t gupcr_coll_le_ct;
+/** Collectives shared access LE events queue handle */
+static ptl_handle_eq_t gupcr_coll_le_eq;
+/** Collectives number of received signals (PUT/ATOMIC) through LE.
+    Cumulative count of LE events already consumed by
+    gupcr_coll_signal_wait; also the base for triggered-op thresholds
+    (Portals counting events are never reset here).  */
+static ptl_size_t gupcr_coll_signal_cnt;
+
+/** Collectives local access MD handle */
+static ptl_handle_md_t gupcr_coll_md;
+/** Collectives local access MD counting events handle */
+static ptl_handle_ct_t gupcr_coll_md_ct;
+/** Collectives local access MD event queue handle */
+static ptl_handle_eq_t gupcr_coll_md_eq;
+/** Collectives number of received ACKs on local md.
+    Cumulative count of ACKs already consumed by gupcr_coll_ack_wait.  */
+static ptl_size_t gupcr_coll_ack_cnt;
+
+/* Collectives thread tree (computed by gupcr_coll_tree_setup).  */
+/** Collectives tree parent thread (ROOT_PARENT on the tree root) */
+int gupcr_coll_parent_thread;
+/** Collectives tree number of children */
+int gupcr_coll_child_cnt;
+/** Collectives tree child's index within its parent */
+int gupcr_coll_child_index;
+/** Collectives tree children threads */
+int gupcr_coll_child[GUPCR_TREE_FANOUT];
+
+/**
+ * Initialize collectives thread tree.
+ *
+ * A collectives tree starts from the "start" thread number and
+ * includes only "nthreads" (e.g. threads involved in
+ * the reduce process).  The simplest case is when all the
+ * threads are involved in which case start=0 and
+ * nthreads=THREADS (e.g. used for broadcast).
+ *
+ * The collectives thread tree can be organized in a
+ * form where the "newroot" value identifies
+ * the root thread (only if the "newroot" thread
+ * is participating in the operation).
+ * @param [in] newroot A hint for the tree root thread.
+ * @param [in] start Start thread for reduce
+ * @param [in] nthreads Number of threads participating
+ *
+ */
+void
+gupcr_coll_tree_setup (size_t newroot, size_t start, int nthreads)
+{
+/* Convert from/to 0-(THREADS-1) to start-(nthreads-1) range.  */
+#define NEWID(id,first) ((id - first + THREADS) % THREADS)
+#define OLDID(nid,first) ((nid + first) % THREADS)
+
+/* Remap into the new root (from root 0 to "root").  */
+#define NEWIDROOT(id,top,cnt) ((cnt + id - top) % cnt)
+#define OLDIDROOT(nid,top,cnt) ((nid + top) % cnt)
+  int i;
+  int ok_to_root = 0;
+  int myid;
+  /* Requested root's id in the renumbered 0..nthreads-1 space.  */
+  int root = NEWID (newroot, start);
+
+  gupcr_debug (FC_COLL, "newroot: %lu, start: %lu nthreads: %d",
+	       (long unsigned) newroot, (long unsigned) start, nthreads);
+
+  /* Check if root node is participating.  If yes, use that for the
+     root, otherwise 0.  */
+  if (root < nthreads)
+    ok_to_root = 1;
+
+  /* Get myid - first convert into the new range (0-nthreads),
+     then, if needed and possible, into the range where newroot becomes 0.  */
+  myid = NEWID (MYTHREAD, start);
+  if (ok_to_root)
+    myid = NEWIDROOT (myid, root, nthreads);
+
+  /* Calculate the thread id's of the children and parent.  */
+  gupcr_coll_child_cnt = 0;
+  for (i = 0; i < GUPCR_TREE_FANOUT; i++)
+    {
+      /* In the renumbered space the children of "myid" are the
+         contiguous ids FANOUT*myid+1 .. FANOUT*myid+FANOUT.  */
+      int child = (GUPCR_TREE_FANOUT * myid + i + 1);
+      if (child < nthreads)
+	{
+	  ++gupcr_coll_child_cnt;
+	  /* Map the child id back to a real UPC thread number.  */
+	  if (ok_to_root)
+	    child = OLDIDROOT (child, root, nthreads);
+	  gupcr_coll_child[i] = OLDID (child, start);
+	}
+    }
+  if (myid)
+    {
+      /* Non-root thread: derive the parent's id and this thread's
+         slot index (0..GUPCR_TREE_FANOUT-1) within that parent, then
+         map the parent id back to a real UPC thread number.  */
+      gupcr_coll_parent_thread = (myid - 1) / GUPCR_TREE_FANOUT;
+      gupcr_coll_child_index =
+	myid - gupcr_coll_parent_thread * GUPCR_TREE_FANOUT - 1;
+      if (ok_to_root)
+	gupcr_coll_parent_thread =
+	  OLDIDROOT (gupcr_coll_parent_thread, root, nthreads);
+      gupcr_coll_parent_thread = OLDID (gupcr_coll_parent_thread, start);
+    }
+  else
+    /* Tree root: marked with the distinguished ROOT_PARENT value
+       (tested by IS_ROOT_THREAD).  */
+    gupcr_coll_parent_thread = ROOT_PARENT;
+}
+
+/**
+ * Collective PUT operation
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
+ */
+
+void
+gupcr_coll_put (size_t dthread, size_t doffset, size_t soffset, size_t nbytes)
+{
+  ptl_process_t rpid;
+
+  gupcr_debug (FC_COLL, "%d:0x%lx %lu:0x%lx %lu",
+	       MYTHREAD, (long unsigned) soffset,
+	       (long unsigned) dthread, (long unsigned) doffset,
+	       (long unsigned) nbytes);
+  rpid.rank = dthread;
+  /* Offsets are relative to the MD/LE that map the whole shared
+     space (see gupcr_coll_init).  PTL_ACK_REQ requests an ACK
+     counting event on the local MD; completion is observed later
+     via gupcr_coll_ack_wait.  */
+  gupcr_portals_call (PtlPut,
+		      (gupcr_coll_md, soffset, nbytes, PTL_ACK_REQ, rpid,
+		       GUPCR_PTL_PTE_COLL, PTL_NO_MATCH_BITS, doffset,
+		       PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA));
+}
+
+/**
+ * Collective triggered PUT operation
+ *
+ * Schedule put operation once number of signals reaches
+ * the specified value.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
+ * @param [in] cnt Trigger count
+ */
+void
+gupcr_coll_trigput (size_t dthread, size_t doffset, size_t soffset,
+		    size_t nbytes, size_t cnt)
+{
+  ptl_process_t rpid;
+
+  gupcr_debug (FC_COLL, "%d:0x%lx -> %lu:0x%lx %lu trig %lu",
+	       MYTHREAD, (long unsigned) soffset,
+	       (long unsigned) dthread, (long unsigned) doffset,
+	       (long unsigned) nbytes, (long unsigned) cnt);
+  rpid.rank = dthread;
+  /* The trigger threshold is expressed relative to the signals
+     already consumed (gupcr_coll_signal_cnt): the put fires after
+     "cnt" additional events arrive on the collectives LE counter.  */
+  gupcr_portals_call (PtlTriggeredPut,
+		      (gupcr_coll_md, soffset, nbytes, PTL_ACK_REQ, rpid,
+		       GUPCR_PTL_PTE_COLL, PTL_NO_MATCH_BITS, doffset,
+		       PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA, gupcr_coll_le_ct,
+		       gupcr_coll_signal_cnt + cnt));
+}
+
+/**
+ * Collective atomic PUT operation.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
+ * @param [in] op Portals atomic operation
+ * @param [in] datatype Portals atomic data type
+ */
+
+void
+gupcr_coll_put_atomic (size_t dthread, size_t doffset, size_t soffset,
+		       size_t nbytes, ptl_op_t op, ptl_datatype_t datatype)
+{
+  ptl_process_t rpid;
+
+  gupcr_debug (FC_COLL, "%d:0x%lx %lu:0x%lx %lu %s %s",
+	       MYTHREAD, (long unsigned) soffset,
+	       (long unsigned) dthread, (long unsigned) doffset,
+	       (long unsigned) nbytes,
+	       gupcr_strptlop (op), gupcr_strptldatatype (datatype));
+  rpid.rank = dthread;
+  /* The target combines the transferred datum into its memory using
+     "op"/"datatype".  PTL_ACK_REQ makes completion visible locally
+     via gupcr_coll_ack_wait.  */
+  gupcr_portals_call (PtlAtomic,
+		      (gupcr_coll_md, soffset, nbytes, PTL_ACK_REQ, rpid,
+		       GUPCR_PTL_PTE_COLL, PTL_NO_MATCH_BITS, doffset,
+		       PTL_NULL_USER_PTR, PTL_NULL_HDR_DATA, op, datatype));
+}
+
+/**
+ * Collective triggered atomic PUT operation.
+ *
+ * Schedule atomic put operation once number of signals reaches
+ * the specified value.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset in the shared space
+ * @param [in] soffset Source offset in the shared space
+ * @param [in] nbytes Number of bytes to copy
+ * @param [in] op Portals atomic operation
+ * @param [in] datatype Portals atomic data type
+ * @param [in] cnt Number of signals that triggers
+ */
+void
+gupcr_coll_trigput_atomic (size_t dthread, size_t doffset, size_t soffset,
+			   size_t nbytes, ptl_op_t op,
+			   ptl_datatype_t datatype, size_t cnt)
+{
+  ptl_process_t rpid;
+
+  gupcr_debug (FC_COLL, "%d:0x%lx %lu:0x%lx %lu %s %s trig %lu",
+	       MYTHREAD, (long unsigned) soffset,
+	       (long unsigned) dthread, (long unsigned) doffset,
+	       (long unsigned) nbytes,
+	       gupcr_strptlop (op), gupcr_strptldatatype (datatype),
+	       (long unsigned) cnt);
+  rpid.rank = dthread;
+  /* The trigger threshold must be expressed relative to the signals
+     already consumed (gupcr_coll_signal_cnt), exactly as in
+     gupcr_coll_trigput: Portals counting events are cumulative and
+     are never reset here, so an absolute threshold of "cnt" would
+     already be satisfied (and fire immediately) on any collective
+     after the first one.  */
+  gupcr_portals_call (PtlTriggeredAtomic,
+		      (gupcr_coll_md, soffset,
+		       nbytes, PTL_ACK_REQ, rpid, GUPCR_PTL_PTE_COLL,
+		       PTL_NO_MATCH_BITS, doffset, PTL_NULL_USER_PTR,
+		       PTL_NULL_HDR_DATA, op, datatype, gupcr_coll_le_ct,
+		       gupcr_coll_signal_cnt + cnt));
+}
+
+/**
+ * Collectives wait for operation completion
+ * This function is used in cases where a thread needs to wait
+ * for the completion of remote operations.
+ *
+ * @param [in] cnt Wait count
+ */
+void
+gupcr_coll_ack_wait (size_t cnt)
+{
+  ptl_ct_event_t ct;
+  gupcr_debug (FC_COLL, "wait for %lu (%lu)",
+               (long unsigned) cnt,
+	       (long unsigned) (gupcr_coll_ack_cnt + cnt));
+  /* Counting events accumulate; block until the running total of MD
+     ACKs reaches the previously consumed count plus "cnt" new ones.  */
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_coll_md_ct, gupcr_coll_ack_cnt + cnt, &ct));
+  if (ct.failure)
+    {
+      /* Report the queued failure event details, then abort.  */
+      gupcr_process_fail_events (gupcr_coll_md_eq);
+      gupcr_fatal_error ("received an error on collective MD");
+    }
+  /* Record the ACKs as consumed for subsequent thresholds.  */
+  gupcr_coll_ack_cnt += cnt;
+}
+
+/**
+ * Collectives wait for signaling events
+ * This function is used to wait for other threads to complete
+ * operations in the thread's shared space (e.g. children performing
+ * atomic ops in the parent's shared space).
+ *
+ * @param [in] cnt Wait count
+ */
+void
+gupcr_coll_signal_wait (size_t cnt)
+{
+  ptl_ct_event_t ct;
+
+  gupcr_debug (FC_COLL, "wait for %lu (%lu)",
+	       (long unsigned) cnt,
+	       (long unsigned) (gupcr_coll_signal_cnt + cnt));
+  /* Counting events accumulate; block until "cnt" additional signals
+     (puts/atomics into this thread's collectives LE) have arrived on
+     top of those already consumed.  */
+  gupcr_portals_call (PtlCTWait,
+		      (gupcr_coll_le_ct, gupcr_coll_signal_cnt + cnt, &ct));
+  if (ct.failure)
+    {
+      /* Report the queued failure event details, then abort.  */
+      gupcr_process_fail_events (gupcr_coll_le_eq);
+      gupcr_fatal_error ("received an error on collective LE");
+    }
+  /* Record the signals as consumed for subsequent thresholds.  */
+  gupcr_coll_signal_cnt += cnt;
+}
+
+/**
+ * Initialize collectives resources.
+ * @ingroup INIT
+ *
+ * A thread's shared space is mapped via a Portals LE for other
+ * threads to write to, and an MD as a source for remote
+ * operations.  In this way, the address field of the shared pointer
+ * can be used as an offset into LE/MD.
+ */
+void
+gupcr_coll_init (void)
+{
+  ptl_md_t md;
+  ptl_pt_index_t pte;
+  ptl_le_t le;
+
+  gupcr_log (FC_COLL, "coll init called");
+
+  /* Allocate the Portals PTE that is used for collectives.  */
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_coll_le_eq));
+  gupcr_portals_call (PtlPTAlloc, (gupcr_ptl_ni, 0,
+				   gupcr_coll_le_eq, GUPCR_PTL_PTE_COLL,
+				   &pte));
+  /* The PTE index must match the well-known value that other threads
+     use as the target of collectives traffic.  */
+  if (pte != GUPCR_PTL_PTE_COLL)
+    gupcr_fatal_error ("cannot allocate PTE GUPCR_PTL_PTE_COLL.");
+  gupcr_debug (FC_COLL, "Collectives PTE allocated: %d", GUPCR_PTL_PTE_COLL);
+
+  /* Allocate the Portals LE that is used for collectives.  */
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_coll_le_ct));
+  /* The LE covers the entire shared space, so a shared pointer's
+     address field can be used directly as the target offset.  */
+  le.start = gupcr_gmem_base;
+  le.length = gupcr_gmem_size;
+  le.ct_handle = gupcr_coll_le_ct;
+  le.uid = PTL_UID_ANY;
+  /* Count communication events; successful full events are
+     suppressed, so the event queue carries failures only.  */
+  le.options = PTL_LE_OP_PUT | PTL_LE_OP_GET | PTL_LE_EVENT_CT_COMM |
+    PTL_LE_EVENT_SUCCESS_DISABLE | PTL_LE_EVENT_LINK_DISABLE;
+  gupcr_portals_call (PtlLEAppend, (gupcr_ptl_ni, GUPCR_PTL_PTE_COLL, &le,
+				    PTL_PRIORITY_LIST, NULL, &gupcr_coll_le));
+  gupcr_debug (FC_COLL, "Collectives LE created at 0x%lx size 0x%lx",
+	       (long unsigned) gupcr_gmem_base,
+	       (long unsigned) gupcr_gmem_size);
+
+  /* Setup the Portals MD for local source/destination copying.
+     We need to map only the shared memory space.  */
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_coll_md_ct));
+  gupcr_portals_call (PtlEQAlloc, (gupcr_ptl_ni, 1, &gupcr_coll_md_eq));
+  md.start = gupcr_gmem_base;
+  md.length = gupcr_gmem_size;
+  /* Count ACK/REPLY events on the MD; successful full events are
+     suppressed, so the event queue carries failures only.  */
+  md.options = PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_CT_REPLY |
+    PTL_MD_EVENT_SUCCESS_DISABLE;
+  md.eq_handle = gupcr_coll_md_eq;
+  md.ct_handle = gupcr_coll_md_ct;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_coll_md));
+
+  /* Reset the number of signals/acks.  */
+  gupcr_coll_signal_cnt = 0;
+  gupcr_coll_ack_cnt = 0;
+}
+
+/**
+ * Release collectives resources.
+ * @ingroup INIT
+ */
+void
+gupcr_coll_fini (void)
+{
+  gupcr_log (FC_COLL, "coll fini called");
+  /* Release the collectives MD and its counting/event resources.  */
+  gupcr_portals_call (PtlMDRelease, (gupcr_coll_md));
+  gupcr_portals_call (PtlCTFree, (gupcr_coll_md_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_coll_md_eq));
+  /* Release the collectives LE and PTE; the LE is unlinked before
+     its PTE is freed.  */
+  gupcr_portals_call (PtlLEUnlink, (gupcr_coll_le));
+  gupcr_portals_call (PtlCTFree, (gupcr_coll_le_ct));
+  gupcr_portals_call (PtlEQFree, (gupcr_coll_le_eq));
+  gupcr_portals_call (PtlPTFree, (gupcr_ptl_ni, GUPCR_PTL_PTE_COLL));
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_coll_sup.h
===================================================================
--- libgupc/portals4/gupcr_coll_sup.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_coll_sup.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,106 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef _GUPCR_COLL_SUP_H_
+#define _GUPCR_COLL_SUP_H_ 1
+
+/**
+ * @file gupcr_coll_sup.h
+ * GUPC Portals4 collectives implementation support routines.
+ *
+ * @addtogroup COLLECTIVES GUPCR Collectives Functions
+ * @{
+ */
+
+/** Convert from UPC collectives char to Portals atomic type.  */
+#define UPC_COLL_TO_PTL_CHAR PTL_INT8_T
+/** Convert from UPC collectives unsigned char to Portals atomic type.  */
+#define UPC_COLL_TO_PTL_UCHAR PTL_UINT8_T
+/** Convert from UPC collectives short to Portals atomic type.  */
+#if __SIZEOF_SHORT__ == 2
+#define UPC_COLL_TO_PTL_SHORT PTL_INT16_T
+#define UPC_COLL_TO_PTL_USHORT PTL_UINT16_T
+#elif __SIZEOF_SHORT__ == 4
+#define UPC_COLL_TO_PTL_SHORT PTL_INT32_T
+#define UPC_COLL_TO_PTL_USHORT PTL_UINT32_T
+#else
+#error "Size of short not supported"
+#endif
+/** Convert from UPC collectives int to Portals atomic type.  */
+#if __SIZEOF_INT__ == 4
+#define UPC_COLL_TO_PTL_INT PTL_INT32_T
+#define UPC_COLL_TO_PTL_UINT PTL_UINT32_T
+#elif __SIZEOF_INT__ == 8
+#define UPC_COLL_TO_PTL_INT PTL_INT64_T
+#define UPC_COLL_TO_PTL_UINT PTL_UINT64_T
+#else
+#error "Size of int not supported"
+#endif
+/** Convert from UPC collectives long to Portals atomic type.  */
+#if __SIZEOF_LONG__ == 4
+#define UPC_COLL_TO_PTL_LONG PTL_INT32_T
+#define UPC_COLL_TO_PTL_ULONG PTL_UINT32_T
+#elif __SIZEOF_LONG__ == 8
+#define UPC_COLL_TO_PTL_LONG PTL_INT64_T
+#define UPC_COLL_TO_PTL_ULONG PTL_UINT64_T
+#else
+#error "Size of long not supported"
+#endif
+/** Convert from UPC collectives float to Portals atomic type.  */
+#define UPC_COLL_TO_PTL_FLOAT PTL_FLOAT
+/** Convert from UPC collectives double to Portals atomic type.  */
+#define UPC_COLL_TO_PTL_DOUBLE PTL_DOUBLE
+/** Convert from UPC collectives long double to Portals atomic type.  */
+#define UPC_COLL_TO_PTL_LONG_DOUBLE PTL_LONG_DOUBLE
+
+/* Collectives tree state, defined in gupcr_coll_sup.c and computed
+   by gupcr_coll_tree_setup.  */
+extern int gupcr_coll_parent_thread;
+extern int gupcr_coll_child_cnt;
+extern int gupcr_coll_child_index;
+extern int gupcr_coll_child[GUPCR_TREE_FANOUT];
+
+/** Check if thread is the root thread by checking its parent.  */
+#define IS_ROOT_THREAD (gupcr_coll_parent_thread == ROOT_PARENT)
+
+/* Collectives support entry points (see gupcr_coll_sup.c).  */
+void gupcr_coll_tree_setup (size_t newroot, size_t start, int nthreads);
+void gupcr_coll_put (size_t dthread,
+		     size_t doffset, size_t soffset, size_t nbytes);
+void gupcr_coll_trigput (size_t dthread,
+			 size_t doffset, size_t soffset, size_t nbytes,
+			 size_t cnt);
+void gupcr_coll_put_atomic (size_t dthread, size_t doffset, size_t soffset,
+			    size_t nbytes, ptl_op_t op,
+			    ptl_datatype_t datatype);
+void gupcr_coll_trigput_atomic (size_t dthread, size_t doffset,
+				size_t soffset, size_t nbytes, ptl_op_t op,
+				ptl_datatype_t datatype, size_t cnt);
+void gupcr_coll_ack_wait (size_t cnt);
+void gupcr_coll_signal_wait (size_t cnt);
+
+void gupcr_coll_init (void);
+void gupcr_coll_fini (void);
+
+/** @} */
+
+#endif /* gupcr_coll_sup.h */
Index: libgupc/portals4/gupcr_config.h
===================================================================
--- libgupc/portals4/gupcr_config.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_config.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,180 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_config.h
+ * GUPC Runtime configuration
+ */
+
+#ifndef _GUPCR_CONFIG_H_
+#define _GUPCR_CONFIG_H_
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#if TIME_WITH_SYS_TIME
+#include <sys/time.h>
+#endif
+#include <sys/time.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#ifdef _POSIX_PRIORITY_SCHEDULING
+#define __USE_GNU
+#include <sched.h>
+#endif
+
+#include "config.h"
+
+#define DEV_ZERO "/dev/zero"
+#define OFFSET_ZERO ((off_t) 0)
+/* Darwin has MAP_ANON defined for anonymous memory map.  */
+#if !MAP_ANONYMOUS && MAP_ANON
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#define MAP_ERROR ((void *) -1)
+
+/* Spin-wait tuning constants.  NOTE(review): consumers live elsewhere
+   in the runtime; confirm exact semantics at the point of use.  */
+#define GUPCR_SPIN_THREAD_SLOTS 4
+#define GUPCR_SPIN_SLOT_COUNT 64
+#define GUPCR_SPIN_MAX_MULT 1024
+
+/* Size units.  */
+#define KILOBYTE 1024
+#define C64K (64*KILOBYTE)
+#define MEGABYTE (KILOBYTE*KILOBYTE)
+#ifndef INT_MIN
+/** __INT_MAX__ is predefined by the gcc compiler.  */
+#define INT_MIN (-__INT_MAX__ - 1)
+#endif
+
+/* GUPCR_TARGET64 is 1 on LP64 targets, 0 otherwise.  */
+//begin detect_target64
+#if (defined (_LP64) && _LP64)
+#define GUPCR_TARGET64 1
+#else
+#define GUPCR_TARGET64 0
+#endif
+//end detect_target64
+
+/* Fixed-size unsigned integer types expressed via GCC machine modes
+   (QI = 1 byte, HI = 2, SI = 4, DI = 8, TI = 16).  */
+//begin mode_types
+typedef unsigned int u_intQI_t __attribute__ ((__mode__ (__QI__)));
+typedef unsigned int u_intHI_t __attribute__ ((__mode__ (__HI__)));
+typedef unsigned int u_intSI_t __attribute__ ((__mode__ (__SI__)));
+typedef unsigned int u_intDI_t __attribute__ ((__mode__ (__DI__)));
+#if GUPCR_TARGET64
+typedef unsigned int u_intTI_t __attribute__ ((__mode__ (__TI__)));
+#endif /* GUPCR_TARGET64 */
+//end mode_types
+
+//begin lib_min_max
+
+/* Helper functions.  */
+#define GUPCR_MIN(x,y) (((x) < (y)) ? (x): (y))
+#define GUPCR_MAX(x,y) (((x) > (y)) ? (x): (y))
+#define GUPCR_ABS(x) (((x) > 0) ? (x): -(x))
+#define GUPCR_ROUND(x, r) (((x) + (r) - 1)/(r)*(r))
+//end lib_min_max
+
+//begin lib_config_heap
+
+/** Maximum heap size
+    Set here as 64 gigabytes on a 64-bit implementation
+    and 1 gigabyte on other (eg, 32 bit) implementations.  */
+#define GUPCR_MAX_HEAP_SIZE (((sizeof (void *)*8) == 64) \
+                              ? (64L * KILOBYTE * MEGABYTE) \
+			      : ( 1L * KILOBYTE * MEGABYTE))
+
+/** Default per thread UPC shared heap size.  */
+#define GUPCR_DEFAULT_PER_THREAD_HEAP_SIZE (256*MEGABYTE)
+
+/** The minimum number of bytes to allocate (128 bytes).
+
+    This allows for 64 bytes of heap management overhead and 64
+    bytes of allocation.  The allocation will be aligned to a 64
+    byte boundary.  This is not space efficient, but is intended to
+    provide a minimal alignment that agrees with most CPU cache line
+    size requirements.  */
+#define GUPCR_HEAP_ALLOC_MIN 128
+
+/** The minimum number of bytes to allocate (in bits).  */
+#define GUPCR_HEAP_ALLOC_MIN_BITS 7
+
+/** The size of the heap management header block.  */
+#define GUPCR_HEAP_ALLOC_OVERHEAD 64
+
+/** The number of allocation pools per heap.  */
+#define GUPCR_HEAP_NUM_POOLS (SIZE_T_BITS - GUPCR_HEAP_ALLOC_MIN_BITS)
+
+/** An unlikely barrier id to be used for runtime synchronization */
+#define GUPCR_RUNTIME_BARRIER_ID 0xBADF00D
+
+/** A value used to tag each heap allocated item, checked by upc_free */
+#define GUPCR_HEAP_ALLOC_TAG 0x0DDF00D
+
+//end lib_config_heap
+
+/*
+ * Main entry for UPC programs.
+ * The runtime will execute before calling the user's main
+ * program.  Thus, the user's main program will renamed
+ * inside of the <upc.h> file to 'upc_main'
+ */
+#define GUPCR_START main
+#define GUPCR_MAIN upc_main
+
+//begin lib_config_shared_section
+
+/** The base address of the UPC shared section */
+#define GUPCR_SHARED_SECTION_START __upc_shared_start
+/** The ending address (plus one) of the UPC shared section */
+#define GUPCR_SHARED_SECTION_END __upc_shared_end
+
+/** The base address of the UPC compiled program info section */
+#define GUPCR_PGM_INFO_SECTION_START __upc_pgm_info_start
+/** The ending address (plus one) of the UPC compiled program info section */
+#define GUPCR_PGM_INFO_SECTION_END __upc_pgm_info_end
+
+/** The base address of an array of pointers to UPC initialization routines */
+#define GUPCR_INIT_ARRAY_START __upc_init_array_start
+/** The ending address (plus one) of pointers to UPC initialization routines */
+#define GUPCR_INIT_ARRAY_END   __upc_init_array_end
+
+//end lib_config_shared_section
+
+#endif /* gupcr_config.h */
Index: libgupc/portals4/gupcr_defs.h
===================================================================
--- libgupc/portals4/gupcr_defs.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_defs.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,98 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_defs.h
+ * GUPC Runtime definitions
+ */
+
+#ifndef _GUPCR_DEFS_H_
+#define _GUPCR_DEFS_H_
+
+#include "gupcr_pts.h"
+
+//begin lib_max_threads_def
+
+/* Maximum number of THREADS supported in this implementation,
+   expressed as a power of two: GUPCR_THREADS_MAX == 2^GUPCR_THREAD_SIZE.  */
+#define GUPCR_THREAD_SIZE 12
+#define GUPCR_THREADS_MAX (1 << GUPCR_THREAD_SIZE)
+//end lib_max_threads_def
+
+/* Guard against a thread-number field too narrow for GUPCR_THREADS_MAX.
+   (The check previously tested the undefined name GUPCR_THREADS_SIZE,
+   which the preprocessor evaluates as 0, so it could never trigger.)  */
+#if GUPCR_PTS_PACKED_REP && (GUPCR_THREAD_SIZE > GUPCR_PTS_THREAD_SIZE)
+#error GUPCR_THREADS_MAX exceeds the size of the packed sptr threads field.
+#endif
+
+/* The filename of the location where a runtime
+   error was detected.  This is set by the various
+   debug-enabled ('g') UPC runtime library routines.  */
+extern const char *gupcr_err_filename;
+
+/* The line number of the location where a runtime
+   error was detected.  This is set by the various
+   debug-enabled ('g') UPC runtime library routines.  */
+extern unsigned int gupcr_err_linenum;
+
+/* Record the current error location.  NOTE: this macro expands
+   references to variables named 'filename' and 'linenum', which
+   must be in scope at the point of use.  */
+#define GUPCR_SET_ERR_LOC() \
+  do \
+    { \
+      gupcr_err_filename = filename; \
+      gupcr_err_linenum  = linenum; \
+    } while (0)
+
+/* Reset the recorded error location.  */
+#define GUPCR_CLEAR_ERR_LOC() \
+  do \
+    { \
+      gupcr_err_filename = NULL; \
+      gupcr_err_linenum  = 0; \
+    } while (0)
+
+/* The base address of the UPC shared section.
+   (GUPCR_SHARED_SECTION_START expands to __upc_shared_start; see
+   gupcr_config.h.  Presumably defined by the linker script -- verify.)  */
+extern char GUPCR_SHARED_SECTION_START[1];
+
+/* The ending address (plus one) of the UPC shared section.  */
+extern char GUPCR_SHARED_SECTION_END[1];
+
+/* The base address of the UPC program information section.  */
+extern char GUPCR_PGM_INFO_SECTION_START[1];
+
+/* The ending address (plus one) of the UPC program information section.  */
+extern char GUPCR_PGM_INFO_SECTION_END[1];
+
+#ifndef __UPC__
+/* The value of THREADS when defined at run time.  */
+extern int THREADS;
+
+/* Current thread id.  */
+extern int MYTHREAD;
+#endif /* !__UPC__ */
+
+/* OK to call finalize routines.  */
+extern int gupcr_finalize_ok;
+
+/* Prototype for the main finalize routine.  */
+extern void gupcr_fini (void);
+
+#endif /* gupcr_defs.h */
Index: libgupc/portals4/gupcr_env.c
===================================================================
--- libgupc/portals4/gupcr_env.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_env.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,515 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_env.c
+ * GUPC Runtime environment variables handling
+ */
+
+/**
+ * @addtogroup GUPCUTILS GUPCR Utility Functions
+ * @{
+ */
+
+/**
+
+ UPC_BACKTRACE
+
+	If set, enable backtrace for runtime fatal events.  By
+	default backtrace logging on fatal events is disabled (even though
+	it may be configured).
+
+ UPC_DEBUG
+
+	If set, specifies a list of "facilities" that
+	will have debugging output logged.
+
+ UPC_DEBUGFILE
+
+	Path of log file where UPC runtime debug logs are written.
+
+ UPC_FIRSTTOUCH
+
+	Not used.  Reserved for future use.
+
+ UPC_FORCETOUCH
+
+	Force the thread to touch every memory page in its own shared
+	memory space on startup.  This ensures the correct NUMA memory
+	allocation.  By default it is "YES".
+
+ UPC_LOG
+
+	If set, specifies a list of "facilities" that
+	will be logged.
+
+ UPC_LOGFILE
+
+	Path of log file where UPC runtime logs are written.
+
+ UPC_NO_WARN
+
+	If set, the UPC_NO_WARN variable causes startup warnings (such as
+	those displayed when debugging or tracing is enabled) to be omitted.
+
+ UPC_NODE_LOCAL_MEM
+
+	If set to "NO", then disable node local memory optimization.
+
+ UPC_NODES
+
+	Not used.  Reserved for future use.
+
+ UPC_QUIET
+
+	UPC_QUIET causes all non-application-generated output to be omitted
+	(including both warnings and the initial display of UPC thread
+	layout).
+
+ UPC_POLITE
+
+	Yield the processor frequently while spin-locking.
+
+ UPC_SHARED_HEAP_SIZE
+
+	UPC_SHARED_HEAP_SIZE sets the amount of shared heap (per UPC thread)
+	for your program.
+
+ UPC_STATS
+
+	If set, specifies a list of "facilities" for
+	which UPC runtime statistics will be collected.
+
+ UPC_STATSFILE
+
+	Path of log file where UPC runtime statistics are written.
+
+ UPC_TRACE
+
+	If set, specifies a list of "facilities" that
+	will be traced.
+
+ UPC_TRACEFILE
+
+	Path of log file where UPC trace logs are written.
+
+ The set of facilities are:
+
+	ADDR		UPC casts to local and access to PTS's.
+	ALL		All the facilities
+	ALLOC		UPC dynamic memory allocation
+	ATOMIC		UPC atomic operations
+	BARRIER 	UPC barrier/notify/wait operations
+	BROADCAST	UPC runtime internal broadcast operations
+	COLL		UPC collectives
+	INFO		General information, program info.
+	LOCKS		UPC lock operations
+	MEM		UPC shared memory accesses
+	MISC		Miscellaneous functions
+	PORTALS		Portals operations
+	SYSTEM		System calls
+
+ For all environment variables above that set a filename path,
+ each appearance of a single '%' will be substituted with the process
+ pid.  Two '%'s together escape a single %.  Non-existent intermediate
+ directories will be created.  As a special case, if the filename
+ is "stdout" or "stderr", then output will be directed to the
+ specified file descriptor.  A filename with no '%' indicates
+ that the file will be shared across all processes.
+
+*/
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_utils.h"
+
+/* Map each facility name (as it may appear in the value of a "UPC_"
+   environment variable) to its facility mask bit.  Lookups are
+   case-insensitive (see gupcr_facility_mask_for_name).  */
+static const struct gupcr_fc_tbl_struct
+{
+  const char *name;
+  gupcr_facility_t mask;
+}
+gupcr_facility_table[] =
+{
+  {"addr", FC_ADDR},
+  {"all", FC_ALL},
+  {"alloc", FC_ALLOC},
+  {"atomic", FC_ATOMIC},
+  {"barrier", FC_BARRIER},
+  {"broadcast", FC_BROADCAST},
+  {"coll", FC_COLL},
+  {"info", FC_INFO},
+  {"locks", FC_LOCK},
+  {"mem", FC_MEM},
+  {"misc", FC_MISC},
+  {"nb", FC_NB},
+  {"portals", FC_PORTALS},
+  {"system", FC_SYSTEM}
+};
+
+/* Number of entries in gupcr_facility_table.  */
+#define GUPCR_FC_TBL_SIZE (sizeof (gupcr_facility_table) \
+                           / sizeof (struct gupcr_fc_tbl_struct))
+/* Identifies each recognized "UPC_" environment variable;
+   ENV_NONE marks an unrecognized variable name.  */
+typedef enum
+{
+  ENV_NONE = 0,
+  ENV_UPC_BACKTRACE,
+  ENV_UPC_DEBUG,
+  ENV_UPC_DEBUGFILE,
+  ENV_UPC_FIRSTTOUCH,
+  ENV_UPC_FORCETOUCH,
+  ENV_UPC_LOG,
+  ENV_UPC_LOGFILE,
+  ENV_UPC_NO_WARN,
+  ENV_UPC_NODE_LOCAL_MEM,
+  ENV_UPC_NODES,
+  ENV_UPC_POLITE,
+  ENV_UPC_REQUIRE_SHARED_SIZE,
+  ENV_UPC_QUIET,
+  ENV_UPC_SHARED_HEAP_SIZE,
+  ENV_UPC_STATS,
+  ENV_UPC_STATSFILE,
+  ENV_UPC_TRACE,
+  ENV_UPC_TRACEFILE
+} gupcr_env_kind;
+
+/* Map each recognized environment variable name to its enum kind.  */
+static const struct gupcr_env_var_struct
+{
+  const char *name;
+  gupcr_env_kind kind;
+}
+gupcr_env_var_table[] =
+{
+  {"UPC_BACKTRACE", ENV_UPC_BACKTRACE},
+  {"UPC_DEBUG", ENV_UPC_DEBUG},
+  {"UPC_DEBUGFILE", ENV_UPC_DEBUGFILE},
+  {"UPC_FIRSTTOUCH", ENV_UPC_FIRSTTOUCH},
+  {"UPC_FORCETOUCH", ENV_UPC_FORCETOUCH},
+  {"UPC_LOG", ENV_UPC_LOG},
+  {"UPC_LOGFILE", ENV_UPC_LOGFILE},
+  {"UPC_NO_WARN", ENV_UPC_NO_WARN},
+  {"UPC_NODE_LOCAL_MEM", ENV_UPC_NODE_LOCAL_MEM},
+  {"UPC_NODES", ENV_UPC_NODES},
+  {"UPC_POLITE", ENV_UPC_POLITE},
+  {"UPC_REQUIRE_SHARED_SIZE", ENV_UPC_REQUIRE_SHARED_SIZE},
+  {"UPC_QUIET", ENV_UPC_QUIET},
+  {"UPC_SHARED_HEAP_SIZE", ENV_UPC_SHARED_HEAP_SIZE},
+  {"UPC_STATS", ENV_UPC_STATS},
+  {"UPC_STATSFILE", ENV_UPC_STATSFILE},
+  {"UPC_TRACE", ENV_UPC_TRACE},
+  {"UPC_TRACEFILE", ENV_UPC_TRACEFILE}
+};
+
+/* Number of entries in gupcr_env_var_table.  */
+#define GUPCR_ENV_VAR_TBL_SIZE (sizeof (gupcr_env_var_table) \
+                                / sizeof (struct gupcr_env_var_struct))
+
+/* Look up the name given by FACILITY and return the facility mask value
+   associated with that name.  */
+
+static gupcr_facility_t
+gupcr_facility_mask_for_name (const char *const facility)
+{
+  /* Case-insensitive linear scan of the facility table, using
+     pointer iteration rather than an index.  */
+  const struct gupcr_fc_tbl_struct *entry;
+  const struct gupcr_fc_tbl_struct *const table_end =
+    &gupcr_facility_table[GUPCR_FC_TBL_SIZE];
+  for (entry = gupcr_facility_table; entry < table_end; ++entry)
+    if (!strcasecmp (entry->name, facility))
+      return entry->mask;
+  return FC_NONE;
+}
+
+/* Extract the environment variable name appearing before the
+   first '=' sign in ENV_VAR_ARG; look it up in the list of
+   known "UPC_" environment variables and return an
+   integer value that is used to identify this particular
+   environment variable name.  */
+
+static gupcr_env_kind
+gupcr_env_kind_for_var (const char *const env_var_arg)
+{
+  gupcr_env_kind env_kind = ENV_NONE;
+  unsigned i;
+  char *env_var_dup, *env_var;
+  /* Work on a copy, since strtok modifies its argument.  */
+  gupcr_strdup (env_var_dup, env_var_arg);
+  env_var = strtok (env_var_dup, "=");
+  gupcr_assert (env_var != NULL);
+  for (i = 0; i < GUPCR_ENV_VAR_TBL_SIZE; ++i)
+    {
+      if (!strcmp (gupcr_env_var_table[i].name, env_var))
+	{
+	  env_kind = gupcr_env_var_table[i].kind;
+	  break;
+	}
+    }
+  /* Free the working copy on every path; the previous code returned
+     directly from inside the loop and leaked 'env_var_dup' whenever
+     a match was found.  */
+  gupcr_free (env_var_dup);
+  return env_kind;
+}
+
+/* Process the comma separated list of facility names that
+   appear after the '=' sign.  Return a mask value indicating
+   which facility names were specified.  */
+
+gupcr_facility_t
+gupcr_env_facility_list (const char *const env_var_arg)
+{
+  gupcr_facility_t facility_mask = FC_NONE;
+  char *env_var_dup, *env_var, *facility_name;
+  /* Work on a copy, since strtok modifies its argument.  */
+  gupcr_strdup (env_var_dup, env_var_arg);
+  if ((env_var = strtok (env_var_dup, "=")))
+    {
+      /* OR together the mask bit of each comma-separated
+         facility name that follows the '=' sign.  */
+      while ((facility_name = strtok (NULL, ",")))
+	{
+	  gupcr_facility_t facility;
+	  facility = gupcr_facility_mask_for_name (facility_name);
+	  if (!facility)
+	    gupcr_error ("invalid facility name `%s' found in "
+			 "environment variable: `%s'",
+			 facility_name, env_var_arg);
+	  facility_mask |= facility;
+	}
+    }
+  else
+    /* Report the caller's full argument; 'env_var' is NULL on this
+       path and passing NULL to a %s format is undefined behavior.
+       (Matches the other gupcr_env_* routines, which report
+       env_var_arg here.)  */
+    gupcr_error ("invalid UPC environment variable syntax: `%s'",
+		 env_var_arg);
+  gupcr_free (env_var_dup);
+  return facility_mask;
+}
+
+/* Return a malloc'd copy of ENV_VAR_STR_ARG with
+   the current pid substituted for each occurrence of a '%'.
+   Two '%'s next to each other are equivalent to a single '%'.  */
+
+const char *
+gupcr_env_filename (const char *const env_var_arg)
+{
+  char *env_var_dup, *env_var, *filename_arg;
+  /* Remains NULL if the environment variable is malformed;
+     otherwise the result is allocated via gupcr_malloc.  */
+  char *filename = NULL;
+  gupcr_strdup (env_var_dup, env_var_arg);
+  if ((env_var = strtok (env_var_dup, "=")))
+    {
+      if ((filename_arg = strtok (NULL, "")))
+	{
+	  const char *const pid = gupcr_get_pid_as_string ();
+	  const char *cp;
+	  char *fp;
+	  size_t filename_len;
+	  size_t pid_len = strlen (pid);
+	  /* First pass: calculate the required string size.
+	     "%%" contributes one character; "%" contributes the
+	     length of the pid string.  */
+	  for (cp = filename_arg, filename_len = 0; *cp; ++cp)
+	    {
+	      if (cp[0] == '%' && cp[1] == '%')
+		cp += 1, ++filename_len;
+	      else if (cp[0] == '%')
+		filename_len += pid_len;
+	      else
+		++filename_len;
+	    }
+	  /* Allocate the string; copy ENV_VAR_STR_ARG and
+	     make '%' substitutions.  */
+	  gupcr_malloc (filename, filename_len + 1);
+	  for (fp = filename, cp = filename_arg; *cp; ++cp)
+	    {
+	      if (cp[0] == '%' && cp[1] == '%')
+		cp += 1, *fp++ = '%';
+	      else if (cp[0] == '%')
+		strcpy (fp, pid), fp += pid_len;
+	      else
+		*fp++ = *cp;
+	    }
+	  *fp = '\0';
+	}
+      else
+	gupcr_error ("missing file name in UPC environment "
+		     "variable: `%s'", env_var_arg);
+    }
+  else
+    gupcr_error ("invalid UPC environment variable syntax: `%s'",
+		 env_var_arg);
+  gupcr_free (env_var_dup);
+  return filename;
+}
+
+/* Parse the size value that follows the '=' sign of ENV_VAR_ARG,
+   bounded by VAL_MAX.  Diagnose malformed input via gupcr_error.  */
+static long long
+gupcr_env_size (const char *const env_var_arg, long long int val_max)
+{
+  long long size = 0;
+  char *var_dup, *var_name, *size_str;
+  gupcr_strdup (var_dup, env_var_arg);
+  var_name = strtok (var_dup, "=");
+  if (var_name == NULL)
+    gupcr_error ("invalid UPC environment variable syntax: `%s'",
+		 env_var_arg);
+  else if ((size_str = strtok (NULL, "")) == NULL)
+    gupcr_error ("missing size specifier in UPC environment "
+		 "variable: `%s'", env_var_arg);
+  else
+    {
+      int status;
+      size = gupcr_strtoll (size_str, 0, val_max, &status);
+      if (status)
+	{
+	  gupcr_error ("invalid size specifier in UPC environment "
+		       "variable: `%s'", env_var_arg);
+	  gupcr_strtoll_error (size_str, 0, val_max, status);
+	}
+    }
+  gupcr_free (var_dup);
+  return size;
+}
+
+/* Interpret the value that follows the '=' sign of ENV_VAR_ARG as a
+   boolean: "NO"/"no"/"0" yield 0, "YES"/"yes"/"1" yield 1; anything
+   else is diagnosed.  Defaults to 0 on malformed input.  */
+static int
+gupcr_env_boolean (const char *const env_var_arg)
+{
+  int value = 0;
+  char *var_dup, *var_name, *switch_str;
+  gupcr_strdup (var_dup, env_var_arg);
+  var_name = strtok (var_dup, "=");
+  if (var_name == NULL)
+    gupcr_error ("invalid UPC environment variable syntax: `%s'",
+		 env_var_arg);
+  else if ((switch_str = strtok (NULL, "")) == NULL)
+    gupcr_error ("missing value specifier in UPC environment "
+		 "variable: `%s'", env_var_arg);
+  else if (!strcmp (switch_str, "NO")
+	   || !strcmp (switch_str, "no")
+	   || !strcmp (switch_str, "0"))
+    value = 0;
+  else if (!strcmp (switch_str, "YES")
+	   || !strcmp (switch_str, "yes")
+	   || !strcmp (switch_str, "1"))
+    value = 1;
+  else
+    gupcr_error ("invalid value specifier in UPC environment "
+		 "variable: `%s'", env_var_arg);
+  gupcr_free (var_dup);
+  return value;
+}
+
+/* Process all variables in the environment that begin with "UPC_".
+   Make various calls back into "gupcr_utils.c"  to implement
+   the actions associated with each given environment variable.  */
+
+void
+gupcr_env_init (void)
+{
+  /* System environment, see:  environ (7).  */
+  extern char **environ;
+  const char *env_var;
+  unsigned i;
+  /* Scan the entire environment; only variables whose names begin
+     with the "UPC_" prefix are interpreted by the runtime.  */
+  for (i = 0; (env_var = environ[i]); ++i)
+    {
+      if (!strncmp (env_var, "UPC_", 4))
+	{
+	  const int env_kind = gupcr_env_kind_for_var (env_var);
+	  gupcr_facility_t facility_mask;
+	  const char *filename;
+	  size_t heap_size;
+	  switch (env_kind)
+	    {
+	    case ENV_UPC_BACKTRACE:
+	      gupcr_set_backtrace (gupcr_env_boolean (env_var));
+	      break;
+	    case ENV_UPC_DEBUG:
+	      facility_mask = gupcr_env_facility_list (env_var);
+	      if (facility_mask)
+		gupcr_set_debug_facility (facility_mask);
+	      break;
+	    case ENV_UPC_DEBUGFILE:
+	      filename = gupcr_env_filename (env_var);
+	      if (filename)
+		gupcr_set_debug_filename (filename);
+	      break;
+	    case ENV_UPC_FIRSTTOUCH:
+	      /* no-op */
+	      break;
+	    case ENV_UPC_FORCETOUCH:
+	      gupcr_set_forcetouch (gupcr_env_boolean (env_var));
+	      break;
+	    case ENV_UPC_LOG:
+	      facility_mask = gupcr_env_facility_list (env_var);
+	      if (facility_mask)
+		gupcr_set_log_facility (facility_mask);
+	      break;
+	    case ENV_UPC_LOGFILE:
+	      filename = gupcr_env_filename (env_var);
+	      if (filename)
+		gupcr_set_log_filename (filename);
+	      break;
+	    case ENV_UPC_NO_WARN:
+	      gupcr_no_warn ();
+	      break;
+	    case ENV_UPC_NODE_LOCAL_MEM:
+	      gupcr_set_node_local_memory (gupcr_env_boolean (env_var));
+	      break;
+	    case ENV_UPC_NODES:
+	      /* no-op */
+	      break;
+	    case ENV_UPC_POLITE:
+	      /* no-op */
+	      break;
+	    case ENV_UPC_QUIET:
+	      gupcr_be_quiet ();
+	      break;
+	    case ENV_UPC_SHARED_HEAP_SIZE:
+	      /* Value is parsed as 'long long' but applied as size_t,
+		 bounded above by GUPCR_MAX_HEAP_SIZE.  */
+	      heap_size = (size_t) gupcr_env_size (env_var,
+						   GUPCR_MAX_HEAP_SIZE);
+	      gupcr_set_shared_heap_size (heap_size);
+	      break;
+	    case ENV_UPC_STATS:
+	      facility_mask = gupcr_env_facility_list (env_var);
+	      gupcr_set_stats_facility (facility_mask);
+	      break;
+	    case ENV_UPC_STATSFILE:
+	      filename = gupcr_env_filename (env_var);
+	      if (filename)
+		gupcr_set_stats_filename (filename);
+	      break;
+	    case ENV_UPC_TRACE:
+	      facility_mask = gupcr_env_facility_list (env_var);
+	      gupcr_set_trace_facility (facility_mask);
+	      break;
+	    case ENV_UPC_TRACEFILE:
+	      filename = gupcr_env_filename (env_var);
+	      if (filename)
+		gupcr_set_trace_filename (filename);
+	      break;
+	    case ENV_UPC_REQUIRE_SHARED_SIZE:
+	      /* no-op */
+	      break;
+	    case ENV_NONE:
+	      /* Unrecognized "UPC_" variable: warn but continue.  */
+	      gupcr_warn ("unknown UPC environment variable: %s", env_var);
+	      break;
+	    default:
+	      /* All enum values are handled above; reaching here
+		 indicates a logic error.  */
+	      gupcr_fatal_error ("env variable case value out of range");
+	    }
+	}
+    }
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_gmem.c
===================================================================
--- libgupc/portals4/gupcr_gmem.c	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_gmem.c	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,521 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_gmem.c
+ * GUPC Portals4 shared memory interface.
+ */
+
+/**
+ * @addtogroup GMEM GUPCR Shared Memory Access
+ * @{
+ */
+
+#include "gupcr_config.h"
+#include "gupcr_defs.h"
+#include "gupcr_sup.h"
+#include "gupcr_portals.h"
+#include "gupcr_node.h"
+#include "gupcr_gmem.h"
+#include "gupcr_utils.h"
+#include "gupcr_sync.h"
+
+/** GMEM LE handle */
+static ptl_handle_le_t gupcr_gmem_le;
+
+/** Thread's default shared heap size (bytes).
+    Parenthesized so that the macro expands safely inside any
+    surrounding expression.  */
+#define GUPCR_GMEM_DEFAULT_HEAP_SIZE (256 * 1024 * 1024)
+
+/** Shared memory base and size */
+void *gupcr_gmem_base;
+size_t gupcr_gmem_size;
+
+/** GET event tracking */
+gupcr_gmem_xfer_info_t gupcr_gmem_gets;
+/** PUT event tracking */
+gupcr_gmem_xfer_info_t gupcr_gmem_puts;
+
+/** PUT "bounce buffer" type */
+typedef char gupcr_gmem_put_bb_t[GUPCR_BOUNCE_BUFFER_SIZE];
+/** PUT "bounce buffer" space */
+static gupcr_gmem_put_bb_t gupcr_gmem_put_bb;
+/** PUT "bounce buffer" memory descriptor handle */
+static ptl_handle_md_t gupcr_gmem_put_bb_md;
+/** PUT "bounce buffer" used counter */
+size_t gupcr_gmem_put_bb_used;
+
+/** Previous operation was a strict put */
+int gupcr_pending_strict_put;
+
+/** Heap base offset relative to start of UPC shared region */
+size_t gupcr_gmem_heap_base_offset;
+
+/** Size of UPC shared region reserved for the heap */
+size_t gupcr_gmem_heap_size;
+
+/** Remote puts flow control (high/low water marks for the number
+    of outstanding PUT operations) */
+static const size_t gupcr_gmem_high_mark_puts = GUPCR_MAX_OUTSTANDING_PUTS;
+static const size_t gupcr_gmem_low_mark_puts = GUPCR_MAX_OUTSTANDING_PUTS / 2;
+
+/**
+ * Allocate memory for this thread's shared space contribution.
+ *
+ * Calculate needed memory size and let the node allocate
+ * shared memory and map other thread's shared memory into
+ * the current thread memory space.
+ */
+static void
+gupcr_gmem_alloc_shared (void)
+{
+  /* Both the heap and the static shared data sizes are rounded up
+     with GUPCR_ROUND to a C64K boundary (presumably 64KB -- defined
+     elsewhere, verify).  */
+  size_t heap_size = GUPCR_ROUND (gupcr_get_shared_heap_size (), C64K);
+  size_t data_size = GUPCR_ROUND (GUPCR_SHARED_SECTION_END -
+				  GUPCR_SHARED_SECTION_START, C64K);
+  /* The heap is placed immediately after the static shared data.  */
+  gupcr_gmem_heap_base_offset = data_size;
+  gupcr_gmem_heap_size = heap_size;
+  gupcr_gmem_size = heap_size + data_size;
+
+  /* Allocate this thread's shared space.  */
+  gupcr_gmem_base = gupcr_node_local_alloc (gupcr_gmem_size);
+}
+
+/**
+ * Complete all outstanding remote GET operations.
+ *
+ * This procedure waits for all outstanding GET operations
+ * to complete.  If the wait on the Portals GET counting event returns
+ * a failure, a full event queue is checked for failure specifics
+ * and the program aborts.
+ */
+void
+gupcr_gmem_sync_gets (void)
+{
+  /* Sync all outstanding local accesses.  */
+  GUPCR_MEM_BARRIER ();
+  /* Sync all outstanding remote get accesses.  */
+  if (gupcr_gmem_gets.num_pending > 0)
+    {
+      /* Wait until the GET counting event reaches the total number
+	 of operations ever initiated (completed + pending).  */
+      ptl_size_t num_initiated =
+	gupcr_gmem_gets.num_completed + gupcr_gmem_gets.num_pending;
+      ptl_ct_event_t ct;
+      gupcr_debug (FC_MEM, "outstanding gets: %lu",
+		   (long unsigned) gupcr_gmem_gets.num_pending);
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_gmem_gets.ct_handle, num_initiated, &ct));
+      gupcr_gmem_gets.num_pending = 0;
+      gupcr_gmem_gets.num_completed = num_initiated;
+      /* A non-zero failure count means at least one GET failed;
+	 report the details from the event queue and abort.  */
+      if (ct.failure > 0)
+	{
+	  gupcr_process_fail_events (gupcr_gmem_gets.eq_handle);
+	  gupcr_abort ();
+	}
+    }
+}
+
+/**
+ * Complete outstanding remote PUT operations.
+ *
+ * This procedure waits for all outstanding PUT operations
+ * to complete.  If the wait on the Portals PUT counting event returns
+ * a failure, a full event queue is checked for failure specifics
+ * and the program aborts.
+ */
+void
+gupcr_gmem_sync_puts (void)
+{
+  /* Sync all outstanding local accesses.  */
+  GUPCR_MEM_BARRIER ();
+  /* Sync all outstanding remote put accesses.  */
+  if (gupcr_gmem_puts.num_pending > 0)
+    {
+      /* Wait until the PUT counting event reaches the total number
+	 of operations ever initiated (completed + pending).  */
+      ptl_size_t num_initiated =
+	gupcr_gmem_puts.num_completed + gupcr_gmem_puts.num_pending;
+      ptl_ct_event_t ct;
+      gupcr_debug (FC_MEM, "outstanding puts: %lu",
+		   (long unsigned) gupcr_gmem_puts.num_pending);
+      gupcr_portals_call (PtlCTWait,
+			  (gupcr_gmem_puts.ct_handle, num_initiated, &ct));
+      gupcr_gmem_puts.num_pending = 0;
+      gupcr_gmem_puts.num_completed = num_initiated;
+      /* All puts have drained: clear the strict-put flag and
+	 recycle the entire bounce buffer.  */
+      gupcr_pending_strict_put = 0;
+      gupcr_gmem_put_bb_used = 0;
+      if (ct.failure > 0)
+	{
+	  gupcr_process_fail_events (gupcr_gmem_puts.eq_handle);
+	  gupcr_abort ();
+	}
+    }
+}
+
+/**
+ * Complete all outstanding remote operations.
+ *
+ * Check and wait for completion of all PUT/GET operations.
+ */
+void
+gupcr_gmem_sync (void)
+{
+  /* Complete GETs first, then PUTs; each call also issues a local
+     memory barrier (GUPCR_MEM_BARRIER).  */
+  gupcr_gmem_sync_gets ();
+  gupcr_gmem_sync_puts ();
+}
+
+/**
+ * Read data from remote shared memory.
+ *
+ * A GET request is broken into multiple PtlGet() requests
+ * if the number of requested bytes is greater then
+ * the configuration limited maximum message size.
+ *
+ * @param [in] dest Local memory to receive remote data
+ * @param [in] thread Remote thread to request data from
+ * @param [in] offset Remote address
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_get (void *dest, int thread, size_t offset, size_t n)
+{
+  ptl_process_t rpid;
+  /* Convert 'dest' into an offset within the GET memory descriptor,
+     which gupcr_gmem_init binds starting at USER_PROG_MEM_START.
+     NOTE: arithmetic on 'void *' relies on the GCC extension that
+     treats sizeof (void) as 1.  */
+  char *dest_addr = (char *) (dest - USER_PROG_MEM_START);
+  size_t rem_offset = offset;
+  size_t n_rem = n;
+
+  gupcr_debug (FC_MEM, "%d:0x%lx 0x%lx",
+	       thread, (long unsigned) offset, (long unsigned) dest);
+  rpid.rank = thread;
+  /* Split the request into chunks no larger than the configured
+     maximum Portals message size.  */
+  while (n_rem > 0)
+    {
+      size_t n_xfer;
+      n_xfer = GUPCR_MIN (n_rem, (size_t) GUPCR_PORTALS_MAX_MSG_SIZE);
+      ++gupcr_gmem_gets.num_pending;
+      gupcr_portals_call (PtlGet, (gupcr_gmem_gets.md,
+				   (ptl_size_t) dest_addr, n_xfer, rpid,
+				   GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+				   rem_offset, PTL_NULL_USER_PTR));
+      n_rem -= n_xfer;
+      dest_addr += n_xfer;
+      rem_offset += n_xfer;
+    }
+}
+
+/**
+ * Write data to remote shared memory.
+ *
+ * For data requests smaller than the maximum safe size, the data is first
+ * copied into a bounce buffer.  In this way, the put operation
+ * can be non-blocking and there are no restrictions placed upon
+ * the caller's use of the source data buffer.
+ * Otherwise,  a synchronous operation is performed
+ * and this function returns to the caller after the operation completes.
+ *
+ * @param [in] thread Destination thread
+ * @param [in] offset Destination offset
+ * @param [in] src Local source pointer to data
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_put (int thread, size_t offset, const void *src, size_t n)
+{
+  /* Transfers larger than the "safe" size must fully complete before
+     this function returns, so that the caller may immediately reuse
+     the source buffer.  */
+  int must_sync = (n > GUPCR_GMEM_MAX_SAFE_PUT_SIZE);
+  char *src_addr = (char *) src;
+  size_t n_rem = n;
+  ptl_process_t rpid;
+  gupcr_debug (FC_MEM, "0x%lx %d:0x%lx",
+                       (long unsigned) src, thread, (long unsigned) offset);
+  rpid.rank = thread;
+  /* Large puts must be synchronous, to ensure that it is
+     safe to re-use the source buffer upon return.  */
+  while (n_rem > 0)
+    {
+      size_t n_xfer;
+      ptl_handle_md_t md_handle;
+      ptl_size_t local_offset;
+      n_xfer = GUPCR_MIN (n_rem, (size_t) GUPCR_PORTALS_MAX_MSG_SIZE);
+      if (must_sync)
+	{
+	  /* Send directly from the user's buffer; completion is
+	     forced by the gupcr_gmem_sync_puts () call below.  */
+	  local_offset = src_addr - (char *) USER_PROG_MEM_START;
+	  md_handle = gupcr_gmem_puts.md;
+	}
+      else if (n_rem <= GUPCR_PORTALS_MAX_VOLATILE_SIZE)
+	{
+	  /* Small transfer: use the "volatile" MD (presumably
+	     PTL_MD_VOLATILE, letting the source buffer be reused as
+	     soon as PtlPut returns -- verify against gupcr_gmem_init).  */
+	  local_offset = src_addr - (char *) USER_PROG_MEM_START;
+	  md_handle = gupcr_gmem_puts.md_volatile;
+	}
+      else
+	{
+	  /* Medium transfer: stage the data through the bounce
+	     buffer so the caller's buffer is free on return.  */
+	  char *bounce_buf;
+	  /* If this transfer will overflow the bounce buffer,
+	     then first wait for all outstanding puts to complete.  */
+	  if ((gupcr_gmem_put_bb_used + n_xfer) > GUPCR_BOUNCE_BUFFER_SIZE)
+	    gupcr_gmem_sync_puts ();
+	  bounce_buf = &gupcr_gmem_put_bb[gupcr_gmem_put_bb_used];
+	  memcpy (bounce_buf, src_addr, n_xfer);
+	  local_offset = bounce_buf - gupcr_gmem_put_bb;
+	  gupcr_gmem_put_bb_used += n_xfer;
+	  md_handle = gupcr_gmem_put_bb_md;
+	}
+      ++gupcr_gmem_puts.num_pending;
+      gupcr_portals_call (PtlPut, (md_handle, local_offset, n_xfer,
+				   PTL_ACK_REQ, rpid,
+				   GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+				   offset, PTL_NULL_USER_PTR,
+				   PTL_NULL_HDR_DATA));
+      n_rem -= n_xfer;
+      src_addr += n_xfer;
+
+      /* Flow control: once the number of outstanding puts reaches the
+	 high-water mark, wait for it to drain to the low-water mark.  */
+      if (gupcr_gmem_puts.num_pending == gupcr_gmem_high_mark_puts)
+   	{
+	  ptl_ct_event_t ct;
+	  size_t complete_cnt;
+	  size_t wait_cnt = gupcr_gmem_puts.num_completed
+			    + gupcr_gmem_puts.num_pending
+			    - gupcr_gmem_low_mark_puts;
+	  gupcr_portals_call (PtlCTWait,
+			      (gupcr_gmem_puts.ct_handle, wait_cnt, &ct));
+	  if (ct.failure > 0)
+	    {
+	      gupcr_process_fail_events (gupcr_gmem_puts.eq_handle);
+	      gupcr_abort ();
+	    }
+	  /* Reconcile the pending count with the counter's actual
+	     success total (more puts may have completed than the
+	     minimum we waited for).  */
+	  complete_cnt = ct.success - gupcr_gmem_puts.num_completed;
+	  gupcr_gmem_puts.num_pending -= complete_cnt;
+	  gupcr_gmem_puts.num_completed = ct.success;
+	}
+    }
+  if (must_sync)
+    gupcr_gmem_sync_puts ();
+}
+
+/**
+ * Copy remote shared memory from the source thread
+ * to the destination thread.
+ *
+ * Bulk copy from one thread to another.
+ * The put bounce buffer is used as an intermediate buffer.
+ * Caller assumes responsibility for checking the validity
+ * of the remote thread id's and/or shared memory offsets.
+ *
+ * @param [in] dthread Destination thread
+ * @param [in] doffset Destination offset
+ * @param [in] sthread Source thread
+ * @param [in] soffset Source offset
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_copy (int dthread, size_t doffset,
+		 int sthread, size_t soffset, size_t n)
+{
+  size_t n_rem = n;
+  ptl_size_t dest_addr = doffset;
+  ptl_size_t src_addr = soffset;
+  ptl_process_t dpid;
+  gupcr_debug (FC_MEM, "%d:0x%lx %d:0x%lx %lu",
+	       sthread, (long unsigned) soffset,
+	       dthread, (long unsigned) doffset,
+	       (long unsigned) n);
+  dpid.rank = dthread;
+  while (n_rem > 0)
+    {
+      size_t n_xfer;
+      char *bounce_buf;
+      ptl_size_t local_offset;
+      /* Use the entire put "bounce buffer" if the transfer
+         count is sufficiently large.  */
+      n_xfer = GUPCR_MIN (n_rem, GUPCR_BOUNCE_BUFFER_SIZE);
+      if ((gupcr_gmem_put_bb_used + n_xfer) > GUPCR_BOUNCE_BUFFER_SIZE)
+	gupcr_gmem_sync_puts ();
+      bounce_buf = &gupcr_gmem_put_bb[gupcr_gmem_put_bb_used];
+      gupcr_gmem_put_bb_used += n_xfer;
+      /* Read the source data into the bounce buffer.  */
+      gupcr_gmem_get (bounce_buf, sthread, src_addr, n_xfer);
+      /* The GET must fully complete before its data can be
+	 forwarded to the destination thread.  */
+      gupcr_gmem_sync_gets ();
+      local_offset = bounce_buf - gupcr_gmem_put_bb;
+      ++gupcr_gmem_puts.num_pending;
+      gupcr_portals_call (PtlPut, (gupcr_gmem_put_bb_md, local_offset, n_xfer,
+				   PTL_ACK_REQ, dpid,
+				   GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+				   dest_addr, PTL_NULL_USER_PTR,
+				   PTL_NULL_HDR_DATA));
+      n_rem -= n_xfer;
+      src_addr += n_xfer;
+      dest_addr += n_xfer;
+    }
+}
+
+/**
+ * Write the same byte value into the bytes of the
+ * destination thread's memory at the specified offset.
+ *
+ * The put bounce buffer is used as an intermediate buffer.
+ * The last write of a chunk of data is non-blocking.
+ * Caller assumes responsibility for checking the validity
+ * of the remote thread id's and/or shared memory offsets.
+ *
+ * @param [in] thread Destination thread
+ * @param [in] offset Destination offset
+ * @param [in] c Set value
+ * @param [in] n Number of bytes to transfer
+ */
+void
+gupcr_gmem_set (int thread, size_t offset, int c, size_t n)
+{
+  size_t n_rem = n;
+  /* Becomes true once the bounce buffer holds a full buffer's worth
+     of the fill byte; later iterations can then skip the memset.  */
+  int already_filled = 0;
+  ptl_size_t dest_addr = offset;
+  ptl_process_t rpid;
+  gupcr_debug (FC_MEM, "0x%x %d:0x%lx %lu", c, thread,
+                       (long unsigned) offset, (long unsigned) n);
+  rpid.rank = thread;
+  while (n_rem > 0)
+    {
+      size_t n_xfer;
+      char *bounce_buf;
+      ptl_size_t local_offset;
+      /* Use the entire put "bounce buffer" if the transfer
+         count is sufficiently large.  */
+      n_xfer = GUPCR_MIN (n_rem, (size_t) GUPCR_BOUNCE_BUFFER_SIZE);
+      if ((gupcr_gmem_put_bb_used + n_xfer) > GUPCR_BOUNCE_BUFFER_SIZE)
+	gupcr_gmem_sync_puts ();
+      bounce_buf = &gupcr_gmem_put_bb[gupcr_gmem_put_bb_used];
+      gupcr_gmem_put_bb_used += n_xfer;
+      /* Fill the bounce buffer, if we haven't already.  */
+      if (!already_filled)
+	{
+	  memset (bounce_buf, c, n_xfer);
+	  /* Only a fill that starts at the buffer base and covers the
+	     whole buffer lets subsequent iterations skip the memset.  */
+	  already_filled = (bounce_buf == gupcr_gmem_put_bb
+			    && n_xfer == GUPCR_BOUNCE_BUFFER_SIZE);
+	}
+      local_offset = bounce_buf - gupcr_gmem_put_bb;
+      ++gupcr_gmem_puts.num_pending;
+      gupcr_portals_call (PtlPut, (gupcr_gmem_put_bb_md, local_offset, n_xfer,
+				   PTL_ACK_REQ, rpid,
+				   GUPCR_PTL_PTE_GMEM, PTL_NO_MATCH_BITS,
+				   dest_addr, PTL_NULL_USER_PTR,
+				   PTL_NULL_HDR_DATA));
+      n_rem -= n_xfer;
+      dest_addr += n_xfer;
+    }
+}
+
+/**
+ * Initialize gmem resources.
+ *
+ * Allocate this thread's contribution to shared memory, attach
+ * it to the GMEM Portals table entry via a list entry, and create
+ * the memory descriptors, event queues, and counting events used
+ * to track non-blocking "get" and "put" operations.
+ * @ingroup INIT
+ */
+void
+gupcr_gmem_init (void)
+{
+  ptl_md_t md, md_volatile;
+  ptl_le_t le;
+  ptl_pt_index_t pte;
+  gupcr_log (FC_MEM, "gmem init called");
+  /* Allocate memory for this thread's contribution to shared memory.  */
+  gupcr_gmem_alloc_shared ();
+  gupcr_portals_call (PtlPTAlloc,
+		      (gupcr_ptl_ni, 0,
+		       PTL_EQ_NONE, GUPCR_PTL_PTE_GMEM, &pte));
+  if (pte != GUPCR_PTL_PTE_GMEM)
+    gupcr_fatal_error ("cannot allocate PTE GUPCR_PTL_PTE_GMEM");
+  gupcr_log (FC_MEM, "Gmem PTE allocated: %d", GUPCR_PTL_PTE_GMEM);
+  /* Setup Gmem LE covering this thread's entire shared region,
+     accepting remote "put" and "get" operations.  */
+  le.start = gupcr_gmem_base;
+  le.length = gupcr_gmem_size;
+  le.ct_handle = PTL_CT_NONE;
+  le.uid = PTL_UID_ANY;
+  le.options = PTL_LE_OP_PUT | PTL_LE_OP_GET;
+  gupcr_portals_call (PtlLEAppend,
+		      (gupcr_ptl_ni,
+		       GUPCR_PTL_PTE_GMEM, &le,
+		       PTL_PRIORITY_LIST, NULL, &gupcr_gmem_le));
+  gupcr_debug (FC_MEM, "Gmem LE created at 0x%lx with size 0x%lx",
+	      (long unsigned) gupcr_gmem_base,
+	      (long unsigned) gupcr_gmem_size);
+  /* Initialize GMEM get lists.  */
+  gupcr_gmem_gets.num_pending = 0;
+  gupcr_gmem_gets.num_completed = 0;
+  /* Track replies via counting events only; full events are
+     suppressed on success.  */
+  gupcr_gmem_gets.md_options =
+    PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_SUCCESS_DISABLE;
+  /* Allocate at least THREADS number of EQ entries.  */
+  gupcr_portals_call (PtlEQAlloc,
+		      (gupcr_ptl_ni, THREADS, &gupcr_gmem_gets.eq_handle));
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_gmem_gets.ct_handle));
+  /* Map user's address space for GET operations.  */
+  md.length = (ptl_size_t) USER_PROG_MEM_SIZE;
+  md.start = (void *) USER_PROG_MEM_START;
+  md.options = gupcr_gmem_gets.md_options;
+  md.eq_handle = gupcr_gmem_gets.eq_handle;
+  md.ct_handle = gupcr_gmem_gets.ct_handle;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_gmem_gets.md));
+  /* Initialize GMEM put lists.  */
+  gupcr_gmem_puts.num_pending = 0;
+  gupcr_gmem_puts.num_completed = 0;
+  /* Track acknowledgments via counting events only.  */
+  gupcr_gmem_puts.md_options =
+    PTL_MD_EVENT_CT_ACK | PTL_MD_EVENT_SUCCESS_DISABLE;
+  /* Allocate at least THREADS number of EQ entries.  */
+  gupcr_portals_call (PtlEQAlloc,
+		      (gupcr_ptl_ni, THREADS, &gupcr_gmem_puts.eq_handle));
+  gupcr_portals_call (PtlCTAlloc, (gupcr_ptl_ni, &gupcr_gmem_puts.ct_handle));
+  /* Map user's address space for PUT operations.  */
+  md.length = (ptl_size_t) USER_PROG_MEM_SIZE;
+  md.start = (void *) USER_PROG_MEM_START;
+  md.options = gupcr_gmem_puts.md_options;
+  md.eq_handle = gupcr_gmem_puts.eq_handle;
+  md.ct_handle = gupcr_gmem_puts.ct_handle;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_gmem_puts.md));
+  /* And map the same but with a volatile option.  */
+  md_volatile = md;
+  md_volatile.options |= PTL_MD_VOLATILE;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md_volatile,
+				  &gupcr_gmem_puts.md_volatile));
+  /* Initialize GMEM put bounce buffer; it shares the put-side
+     EQ and CT handles.  */
+  md.length = GUPCR_BOUNCE_BUFFER_SIZE;
+  md.start = gupcr_gmem_put_bb;
+  md.options = gupcr_gmem_puts.md_options;
+  md.eq_handle = gupcr_gmem_puts.eq_handle;
+  md.ct_handle = gupcr_gmem_puts.ct_handle;
+  gupcr_portals_call (PtlMDBind, (gupcr_ptl_ni, &md, &gupcr_gmem_put_bb_md));
+}
+
+/**
+ * Release gmem resources.
+ *
+ * Releases, in reverse order of their creation in gupcr_gmem_init,
+ * the memory descriptors, counting events, event queues, list
+ * entry, and Portals table entry.
+ * @ingroup INIT
+ */
+void
+gupcr_gmem_fini (void)
+{
+  gupcr_log (FC_MEM, "gmem fini called");
+  /* Release GET MD.  */
+  gupcr_portals_call (PtlMDRelease, (gupcr_gmem_gets.md));
+  gupcr_portals_call (PtlCTFree, (gupcr_gmem_gets.ct_handle));
+  gupcr_portals_call (PtlEQFree, (gupcr_gmem_gets.eq_handle));
+  /* Release PUT MDs.  Note: gupcr_gmem_init binds three put-side
+     MDs (regular, volatile, and bounce buffer); all three must be
+     released here -- the volatile MD was previously leaked.  */
+  gupcr_portals_call (PtlMDRelease, (gupcr_gmem_puts.md));
+  gupcr_portals_call (PtlMDRelease, (gupcr_gmem_puts.md_volatile));
+  gupcr_portals_call (PtlMDRelease, (gupcr_gmem_put_bb_md));
+  gupcr_portals_call (PtlCTFree, (gupcr_gmem_puts.ct_handle));
+  gupcr_portals_call (PtlEQFree, (gupcr_gmem_puts.eq_handle));
+  /* Release LEs and PTEs.  */
+  gupcr_portals_call (PtlLEUnlink, (gupcr_gmem_le));
+  gupcr_portals_call (PtlPTFree, (gupcr_ptl_ni, GUPCR_PTL_PTE_GMEM));
+}
+
+/** @} */
Index: libgupc/portals4/gupcr_gmem.h
===================================================================
--- libgupc/portals4/gupcr_gmem.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_gmem.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,132 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime Library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef _GUPCR_GMEM_H_
+#define _GUPCR_GMEM_H_
+
+/**
+ * @file gupcr_gmem.h
+ * GUPC Portals4 shared memory interface.
+ */
+
+/**
+ * @addtogroup GMEM GUPCR Shared Memory Access
+ * @{
+ */
+
+/* Configuration-defined limits.  */
+/** Maximum size of the message that uses put bounce buffer.
+    (Parenthesized to avoid mis-association when the macro is
+    used inside a larger expression.)  */
+#define GUPCR_GMEM_MAX_SAFE_PUT_SIZE (1 * KILOBYTE)
+
+/** Max size of the user program.
+ *
+ * To simplify management of memory descriptors the entire user
+ * program address space is mapped into one memory descriptor per
+ * direction of the transfer.
+ * Per linux kernel document: Documentation/x86/x86_64/mm.txt
+ * the maximum size is 0x8000_0000_0000 (2^47).
+ * (The previous literal had an extra hex digit, yielding 2^51.)
+ */
+#define USER_PROG_MEM_SIZE  0x0000800000000000
+/** Beginning of the user program */
+#define USER_PROG_MEM_START NULL
+
+//begin lib_inline_gmem
+/** Check if shared memory of the specified thread can be accessed
+    as node local reference.  */
+#define GUPCR_GMEM_IS_LOCAL(thr) (gupcr_node_map[thr] != NULL)
+/** Convert pointer-to-shared address field into local address.
+    NOTE(review): macro arguments are expanded unparenthesized;
+    pass simple expressions only.  */
+#define GUPCR_GMEM_OFF_TO_LOCAL(thr,off) (gupcr_node_map[thr] + off)
+
+/** GMEM shared memory base */
+extern void *gupcr_gmem_base;
+//end lib_inline_gmem
+
+/** GMEM shared memory size */
+extern ptl_size_t gupcr_gmem_size;
+
+/** GMEM get/put information tracking.
+ *
+ *  Track the information required to access global
+ *  memory in a given direction (get/put) using non-blocking
+ *  'get' and 'put' functions.
+ */
+typedef struct gupcr_gmem_xfer_info_struct
+{
+  /** Number of pending operations */
+  ptl_size_t num_pending;
+  /** Number of completed operations */
+  ptl_size_t num_completed;
+  /** Memory descriptor options (PTL_MD_* flags) */
+  unsigned int md_options;
+  /** Memory descriptor event queue handle */
+  ptl_handle_eq_t eq_handle;
+  /** Memory descriptor counting events handle */
+  ptl_handle_ct_t ct_handle;
+  /** Memory descriptor handle */
+  ptl_handle_md_t md;
+  /** Volatile memory descriptor handle (same region bound
+      with PTL_MD_VOLATILE; used on the put side only) */
+  ptl_handle_md_t md_volatile;
+} gupcr_gmem_xfer_info_t;
+/** GET/PUT information tracking pointer type */
+typedef gupcr_gmem_xfer_info_t *gupcr_gmem_xfer_info_p;
+
+/** GET transfer tracking */
+extern gupcr_gmem_xfer_info_t gupcr_gmem_gets;
+/** PUT transfer tracking */
+extern gupcr_gmem_xfer_info_t gupcr_gmem_puts;
+
+/** PUT "bounce buffer" bytes in use */
+extern size_t gupcr_gmem_put_bb_used;
+
+//begin lib_gmem
+/* Complete all outstanding remote gmem operations.  */
+extern void gupcr_gmem_sync (void);
+//end lib_gmem
+
+//begin lib_inline_gmem
+
+/** If TRUE, a strict PUT operation is pending */
+extern int gupcr_pending_strict_put;
+
+/* Non-blocking shared memory access primitives.  Callers must
+   validate remote thread ids and shared memory offsets.  */
+extern void gupcr_gmem_sync_gets (void);
+extern void gupcr_gmem_sync_puts (void);
+extern void gupcr_gmem_get (void *dest, int rthread, size_t roffset,
+			    size_t n);
+extern void gupcr_gmem_put (int rthread, size_t roffset, const void *src,
+			    size_t n);
+extern void gupcr_gmem_copy (int dthread, size_t doffset, int sthread,
+			     size_t soffset, size_t n);
+extern void gupcr_gmem_set (int dthread, size_t doffset, int c, size_t n);
+
+//end lib_inline_gmem
+
+/* Shared heap placement within the gmem region -- definitions
+   are elsewhere in the runtime; TODO(review): confirm exact
+   semantics against gupcr_alloc.  */
+extern size_t gupcr_gmem_heap_base_offset;
+extern size_t gupcr_gmem_heap_size;
+
+/* Initialization/teardown of gmem resources.  */
+extern void gupcr_gmem_init (void);
+extern void gupcr_gmem_fini (void);
+
+/** @} */
+#endif /* gupcr_gmem.h */
Index: libgupc/portals4/gupcr_lib.h
===================================================================
--- libgupc/portals4/gupcr_lib.h	(.../trunk)	(revision 0)
+++ libgupc/portals4/gupcr_lib.h	(.../branches/gupc)	(revision 231080)
@@ -0,0 +1,72 @@
+/* Copyright (C) 2012-2015 Free Software Foundation, Inc.
+   This file is part of the UPC runtime library.
+   Written by Gary Funck <gary@intrepid.com>
+   and Nenad Vukicevic <nenad@intrepid.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/**
+ * @file gupcr_lib.h
+ * GUPC Runtime definitions of user-visible UPC routines.
+ */
+
+#ifndef _GUPCR_LIB_H_
+#define _GUPCR_LIB_H_
+
+/* Definition of user-visible UPC library routines,
+   in a form that they can be called from the
+   "C"-based runtime.  */
+
+/* Pointer-to-shared component queries.  */
+extern size_t upc_threadof (upc_shared_ptr_t);
+extern size_t upc_phaseof (upc_shared_ptr_t);
+extern upc_shared_ptr_t upc_resetphase (upc_shared_ptr_t);
+extern size_t upc_addrfield (upc_shared_ptr_t);
+extern size_t upc_affinitysize (size_t, size_t, size_t);
+
+/* Whole-program termination.  */
+extern void upc_global_exit (int);
+
+/* Shared memory bulk transfer and fill.  */
+extern void upc_memcpy (upc_shared_ptr_t dest, upc_shared_ptr_t src,
+			size_t n);
+extern void upc_memget (void *dest, upc_shared_ptr_t src, size_t n);
+extern void upc_memput (upc_shared_ptr_t dest, const void *src, size_t n);
+extern void upc_memset (upc_shared_ptr_t dest, int c, size_t n);
+
+/* Shared memory allocation and release.  */
+extern upc_shared_ptr_t upc_global_alloc (size_t, size_t);
+extern upc_shared_ptr_t upc_all_alloc (size_t, size_t);
+extern upc_shared_ptr_t upc_alloc (size_t);
+extern void upc_free (upc_shared_ptr_t);
+extern void upc_all_free (upc_shared_ptr_t);
+
+/* UPC lock allocation and operations.  */
+extern upc_shared_ptr_t upc_lock_alloc (void);
+extern void upc_lock_free (upc_shared_ptr_t);
+extern void upc_all_lock_free (upc_shared_ptr_t);
+extern upc_shared_ptr_t upc_all_lock_alloc (void);
+extern upc_shared_ptr_t upc_global_lock_alloc (void);
+extern void upc_lock (upc_shared_ptr_t);
+extern int upc_lock_attempt (upc_shared_ptr_t);
+extern void upc_unlock (upc_shared_ptr_t);
+
+/* High-resolution timer ("ticks") interface.  */
+typedef uint64_t upc_tick_t;
+extern upc_tick_t upc_ticks_now (void);
+extern uint64_t upc_ticks_to_ns (upc_tick_t ticks);
+
+#endif /* gupcr_lib.h */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]