This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH][libstdc++-v3 parallel mode] PR 33893 work correctly even if omp_get_dynamic()


> Johannes Singler wrote:
>> I'm aware of that, and got a pointer to a detailed instruction on how to
>> do that by Benjamin Kosnik only recently. I already tried to format the
>> changed parts according to the guidelines.
> Sorry, frankly this is not true: there are obviously many lines wider
> than 80 columns (remember, by the way, that it means 80 overall, CR
> included, therefore from 0 to 78 of actual code)

Well, I meant the code parts that were actually changed. Anyway, now I
reformatted all concerned files. I will process the other files later.
New patch attached.



Major revision of the parallel mode algorithms, to work correctly even
if omp_get_dynamic(), PR 33893.
Also, the quicksort variants work correctly if !omp_get_nested().
However, the speedup is limited to 2 in this case. Getting rid of this
limitation is very elaborate since OpenMP 2.5 does not support barriers
among a subset of a team. Maybe things get better with OpenMP 3.0, I
still have to look into this.

Because the process included a lot of formatting cleanup, I'm also
attaching a version of the patch ignoring white space, for better
readability.

Please approve.

tested x86_64-unknown-linux-gnu: no regressions

2007-11-21  Johannes Singler  <singler@ira.uka.de>

       *include/parallel/multiway_merge.h: made omp_dynamic-safe
       *include/parallel/workstealing.h: made omp_dynamic-safe
       *include/parallel/base.h: infrastructure, cleanup
       *include/parallel/par_loop.h: made omp_dynamic-safe
       *include/parallel/features.h: activate loser tree variant
       *include/parallel/quicksort.h: made omp_dynamic-safe
       *include/parallel/compiletime_settings.h: settings overridable
       *include/parallel/equally_split.h: made omp_dynamic-safe
       *include/parallel/omp_loop_static.h: made omp_dynamic-safe
       *include/parallel/random_shuffle.h: made omp_dynamic-safe
       *include/parallel/balanced_quicksort.h: made omp_dynamic-safe
       *include/parallel/set_operations.h: made omp_dynamic-safe
       *include/parallel/unique_copy.h: made omp_dynamic-safe
       *include/parallel/multiway_mergesort.h: made omp_dynamic-safe
       *include/parallel/search.h: made omp_dynamic-safe
       *include/parallel/partition.h: made omp_dynamic-safe
       *include/parallel/partial_sum.h: made omp_dynamic-safe
       *include/parallel/find.h: made omp_dynamic-safe
       *include/parallel/omp_loop.h: made omp_dynamic-safe
       *include/parallel/losertree.h: avoid default constructor

Johannes



Index: include/parallel/multiway_merge.h
===================================================================
--- include/parallel/multiway_merge.h	(revision 130225)
+++ include/parallel/multiway_merge.h	(working copy)
@@ -29,16 +29,16 @@
 // Public License.
 
 /** @file parallel/multiway_merge.h
- *  @brief Implementation of sequential and parallel multiway merge.
- *
- *  Explanations on the high-speed merging routines in the appendix of
- *
- *  P. Sanders.
- *  Fast priority queues for cached memory.
- *  ACM Journal of Experimental Algorithmics, 5, 2000.
- *
- *  This file is a GNU parallel extension to the Standard C++ Library.
- */
+*  @brief Implementation of sequential and parallel multiway merge.
+*
+*  Explanations on the high-speed merging routines in the appendix of
+*
+*  P. Sanders.
+*  Fast priority queues for cached memory.
+*  ACM Journal of Experimental Algorithmics, 5, 2000.
+*
+*  This file is a GNU parallel extension to the Standard C++ Library.
+*/
 
 // Written by Johannes Singler.
 
@@ -62,15 +62,15 @@
 // XXX need iterator typedefs
 namespace __gnu_parallel
 {
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class guarded_iterator;
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	    guarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	     guarded_iterator<RandomAccessIterator, Comparator>& bi2);
@@ -80,7 +80,7 @@
       *  Deriving from RandomAccessIterator is not possible since
       *  RandomAccessIterator need not be a class.
       */
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class guarded_iterator
   {
   private:
@@ -99,8 +99,8 @@
      *  @param end End iterator of sequence.
      *  @param comp Comparator provided for associated overloaded
      *  compare operators. */
-    inline guarded_iterator(RandomAccessIterator begin, 
-			    RandomAccessIterator end, Comparator& comp) 
+    inline guarded_iterator(RandomAccessIterator begin,
+                            RandomAccessIterator end, Comparator& comp)
     : current(begin), end(end), comp(comp)
     { }
 
@@ -125,17 +125,21 @@
     { return current; }
 
     friend bool
-    operator< <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator< <RandomAccessIterator, Comparator>(
+        guarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        guarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
     friend bool
-    operator<= <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator<= <RandomAccessIterator, Comparator>(
+        guarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        guarded_iterator<RandomAccessIterator, Comparator>& bi2);
   };
 
-  /** @brief Compare two elements referenced by guarded iterators.
+/** @brief Compare two elements referenced by guarded iterators.
    *  @param bi1 First iterator.
    *  @param bi2 Second iterator.
    *  @return @c True if less. */
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	    guarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -147,11 +151,11 @@
     return (bi1.comp)(*bi1, *bi2);	//normal compare
   }
 
-  /** @brief Compare two elements referenced by guarded iterators.
+/** @brief Compare two elements referenced by guarded iterators.
    *  @param bi1 First iterator.
    *  @param bi2 Second iterator.
    *  @return @c True if less equal. */
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	     guarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -163,20 +167,20 @@
     return !(bi1.comp)(*bi2, *bi1);	//normal compare
   }
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class unguarded_iterator;
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	    unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	     unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   class unguarded_iterator
   {
   private:
@@ -190,8 +194,8 @@
      *  @param begin Begin iterator of sequence.
      *  @param end Unused, only for compatibility.
      *  @param comp Unused, only for compatibility. */
-    inline unguarded_iterator(RandomAccessIterator begin, 
-			      RandomAccessIterator end, Comparator& comp) 
+    inline unguarded_iterator(RandomAccessIterator begin,
+                              RandomAccessIterator end, Comparator& comp)
     : current(begin), comp(comp)
     { }
 
@@ -206,7 +210,7 @@
 
     /** @brief Dereference operator.
      *  @return Referenced element. */
-    inline typename std::iterator_traits<RandomAccessIterator>::value_type 
+    inline typename std::iterator_traits<RandomAccessIterator>::value_type
     operator*()
     { return *current; }
 
@@ -217,17 +221,21 @@
     { return current; }
 
     friend bool
-    operator< <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator< <RandomAccessIterator, Comparator>(
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
     friend bool
-    operator<= <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+    operator<= <RandomAccessIterator, Comparator>(
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
+        unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
   };
 
-  /** @brief Compare two elements referenced by unguarded iterators.
+/** @brief Compare two elements referenced by unguarded iterators.
    *  @param bi1 First iterator.
    *  @param bi2 Second iterator.
    *  @return @c True if less. */
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	    unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -236,11 +244,11 @@
     return (bi1.comp)(*bi1, *bi2);
   }
 
-  /** @brief Compare two elements referenced by unguarded iterators.
+/** @brief Compare two elements referenced by unguarded iterators.
    *  @param bi1 First iterator.
    *  @param bi2 Second iterator.
    *  @return @c True if less equal. */
-  template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
 	     unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -249,26 +257,30 @@
     return !(bi1.comp)(*bi2, *bi1);
   }
 
-  /** Prepare a set of sequences to be merged without a (end) guard
+/** Prepare a set of sequences to be merged without a (end) guard
    *  @param seqs_begin
    *  @param seqs_end
    *  @param comp
    *  @param min_sequence
    *  @param stable
    *  @pre (seqs_end - seqs_begin > 0) */
-  template<typename RandomAccessIteratorIterator, typename Comparator>
-  typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
+template<typename RandomAccessIteratorIterator, typename Comparator>
+  typename std::iterator_traits<
+      typename std::iterator_traits<RandomAccessIteratorIterator>::value_type
+      ::first_type>::difference_type
   prepare_unguarded(RandomAccessIteratorIterator seqs_begin,
 		    RandomAccessIteratorIterator seqs_end, Comparator comp,
 		    int& min_sequence, bool stable)
   {
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+        ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
+    typedef typename std::iterator_traits<RandomAccessIterator1>
+      ::difference_type
       difference_type;
 
     if ((*seqs_begin).first == (*seqs_begin).second)
@@ -317,7 +329,8 @@
 
     for (; s < (seqs_end - seqs_begin); s++)
       {
-	RandomAccessIterator1 split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, min, comp);
+        RandomAccessIterator1 split = std::lower_bound(
+            seqs_begin[s].first, seqs_begin[s].second, min, comp);
 	overhang_size += seqs_begin[s].second - split;
       }
 
@@ -325,23 +338,27 @@
     return overhang_size;
   }
 
-  /** Prepare a set of sequences to be merged with a (end) guard (sentinel)
+/** Prepare a set of sequences to be merged with a (end) guard (sentinel)
    *  @param seqs_begin
    *  @param seqs_end
    *  @param comp */
-  template<typename RandomAccessIteratorIterator, typename Comparator>
-  typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
+template<typename RandomAccessIteratorIterator, typename Comparator>
+  typename std::iterator_traits<typename std::iterator_traits<
+      RandomAccessIteratorIterator>::value_type::first_type>::difference_type
   prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin,
 			     RandomAccessIteratorIterator seqs_end,
 			     Comparator comp)
   {
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
+    typedef typename std::iterator_traits<RandomAccessIterator1>
+      ::value_type
       value_type;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
+    typedef typename std::iterator_traits<RandomAccessIterator1>
+      ::difference_type
       difference_type;
 
     // Last element in sequence.
@@ -362,8 +379,8 @@
     difference_type overhang_size = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
       {
-	RandomAccessIterator1 split = std::lower_bound((*s).first, (*s).second,
-						       *max, comp);
+        RandomAccessIterator1 split =
+            std::lower_bound((*s).first, (*s).second, *max, comp);
 	overhang_size += (*s).second - split;
 
 	// Set sentinel.
@@ -374,7 +391,7 @@
     return overhang_size;
   }
 
-  /** @brief Highly efficient 3-way merging procedure.
+/** @brief Highly efficient 3-way merging procedure.
    *  @param seqs_begin Begin iterator of iterator pair input sequence.
    *  @param seqs_end End iterator of iterator pair input sequence.
    *  @param target Begin iterator out output sequence.
@@ -382,15 +399,25 @@
    *  @param length Maximum length to merge.
    *  @param stable Unused, stable anyway.
    *  @return End iterator of output sequence. */
-  template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    template<typename RAI, typename C> class iterator,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_3_variant(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
-    
+
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -456,14 +483,23 @@
     return target;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin,
+                            RandomAccessIteratorIterator seqs_end,
+                            RandomAccessIterator3 target,
+                            Comparator comp,
+                            _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
-    
+
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -472,7 +508,8 @@
     RandomAccessIterator3 target_end;
 
     // Stable anyway.
-    difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
+    difference_type overhang =
+        prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
 
     difference_type total_length = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
@@ -480,7 +517,8 @@
 
     if (overhang != -1)
       {
-	difference_type unguarded_length = std::min(length, total_length - overhang);
+        difference_type unguarded_length =
+            std::min(length, total_length - overhang);
 	target_end = multiway_merge_3_variant<unguarded_iterator>
 	  (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
 	overhang = length - unguarded_length;
@@ -527,7 +565,7 @@
     return target_end;
   }
 
-  /** @brief Highly efficient 4-way merging procedure.
+/** @brief Highly efficient 4-way merging procedure.
    *  @param seqs_begin Begin iterator of iterator pair input sequence.
    *  @param seqs_end End iterator of iterator pair input sequence.
    *  @param target Begin iterator out output sequence.
@@ -535,14 +573,23 @@
    *  @param length Maximum length to merge.
    *  @param stable Unused, stable anyway.
    *  @return End iterator of output sequence. */
-  template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    template<typename RAI, typename C> class iterator,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin,
+                           RandomAccessIteratorIterator seqs_end,
+                           RandomAccessIterator3 target,
+                           Comparator comp, _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -633,14 +680,23 @@
     return target;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin,
+                            RandomAccessIteratorIterator seqs_end,
+                            RandomAccessIterator3 target,
+                            Comparator comp,
+                            _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length);
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -649,7 +705,8 @@
     RandomAccessIterator3 target_end;
 
     // Stable anyway.
-    difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
+    difference_type overhang =
+        prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
 
     difference_type total_length = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
@@ -657,7 +714,8 @@
 
     if (overhang != -1)
       {
-	difference_type unguarded_length = std::min(length, total_length - overhang);
+        difference_type unguarded_length =
+            std::min(length, total_length - overhang);
 	target_end = multiway_merge_4_variant<unguarded_iterator>
 	  (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
 	overhang = length - unguarded_length;
@@ -674,10 +732,13 @@
     _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
 #endif
 
-    std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > one_missing(seqs_begin, seqs_end);
+    std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> >
+        one_missing(seqs_begin, seqs_end);
     one_missing.erase(one_missing.begin() + min_seq);	//remove
 
-    target_end = multiway_merge_3_variant<guarded_iterator>(one_missing.begin(), one_missing.end(), target_end, comp, overhang, stable);
+    target_end = multiway_merge_3_variant<guarded_iterator>(
+        one_missing.begin(), one_missing.end(),
+        target_end, comp, overhang, stable);
 
     // Insert back again.
     one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]);
@@ -692,7 +753,7 @@
     return target_end;
   }
 
-  /** @brief Basic multi-way merging procedure.
+/** @brief Basic multi-way merging procedure.
    *
    *  The head elements are kept in a sorted array, new heads are
    *  inserted linearly.
@@ -702,16 +763,24 @@
    *  @param comp Comparator.
    *  @param length Maximum length to merge.
    *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence. 
+ *  @return End iterator of output sequence.
    */
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin,
+                        RandomAccessIteratorIterator seqs_end,
+                        RandomAccessIterator3 target,
+                        Comparator comp, _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -719,7 +788,8 @@
     // Num remaining pieces.
     int k = static_cast<int>(seqs_end - seqs_begin), nrp;
 
-    value_type* pl = static_cast<value_type*>(::operator new(sizeof(value_type) * k));
+    value_type* pl = static_cast<value_type*>(
+      ::operator new(sizeof(value_type) * k));
     int* source = new int[k];
     difference_type total_length = 0;
 
@@ -818,7 +888,8 @@
 	    // Sink down.
 	    j = 1;
 	    while ((j < nrp) && (comp(pl[j], pl[j - 1]) ||
-				 (!comp(pl[j - 1], pl[j]) && (source[j] < source[j - 1]))))
+                                (!comp(pl[j - 1], pl[j])
+                                    && (source[j] < source[j - 1]))))
 	      {
 		std::swap(pl[j - 1], pl[j]);
 		std::swap(source[j - 1], source[j]);
@@ -869,7 +940,7 @@
     return target;
   }
 
-  /** @brief Multi-way merging procedure for a high branching factor,
+/** @brief Multi-way merging procedure for a high branching factor,
    * guarded case.
    *
    *  The head elements are kept in a loser tree.
@@ -879,16 +950,26 @@
    *  @param comp Comparator.
    *  @param length Maximum length to merge.
    *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence. 
+ *  @return End iterator of output sequence.
    */
-  template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename LT,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
+                            RandomAccessIteratorIterator seqs_end,
+                            RandomAccessIterator3 target,
+                            Comparator comp,
+                            _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -978,7 +1059,7 @@
     return target;
   }
 
-  /** @brief Multi-way merging procedure for a high branching factor,
+/** @brief Multi-way merging procedure for a high branching factor,
    * unguarded case.
    *
    *  The head elements are kept in a loser tree.
@@ -989,16 +1070,25 @@
    *  @param length Maximum length to merge.
    *  @param stable Stable merging incurs a performance penalty.
    *  @return End iterator of output sequence.
-   *  @pre No input will run out of elements during the merge. 
+ *  @pre No input will run out of elements during the merge.
    */
-  template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename LT,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp, typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
+                                      RandomAccessIteratorIterator seqs_end,
+                                      RandomAccessIterator3 target,
+                                      Comparator comp,
+                                      _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -1045,13 +1135,16 @@
 	    source = lt.get_min_source();
 
 #if _GLIBCXX_ASSERTIONS
-	    _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
+            _GLIBCXX_PARALLEL_ASSERT(i == 0
+                || !comp(*(seqs_begin[source].first), *(target - 1)));
 #endif
 
 	    *(target++) = *(seqs_begin[source].first++);
 
 #if _GLIBCXX_ASSERTIONS
-	    _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i == length - 1));
+            _GLIBCXX_PARALLEL_ASSERT(
+                (seqs_begin[source].first != seqs_begin[source].second)
+                || (i == length - 1));
 	    i++;
 #endif
 	    // Feed.
@@ -1071,15 +1164,19 @@
 #if _GLIBCXX_ASSERTIONS
 	    if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
 	      printf("         %i %i %i\n", length, i, source);
-	    _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
+            _GLIBCXX_PARALLEL_ASSERT(i == 0
+                || !comp(*(seqs_begin[source].first), *(target - 1)));
 #endif
 
 	    *(target++) = *(seqs_begin[source].first++);
 
 #if _GLIBCXX_ASSERTIONS
-	    if (!((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1)))
+            if (!((seqs_begin[source].first != seqs_begin[source].second)
+                || (i >= length - 1)))
 	      printf("         %i %i %i\n", length, i, source);
-	    _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1));
+            _GLIBCXX_PARALLEL_ASSERT(
+                (seqs_begin[source].first != seqs_begin[source].second)
+                || (i >= length - 1));
 	    i++;
 #endif
 	    // Feed.
@@ -1091,15 +1188,24 @@
     return target;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin,
+                                     RandomAccessIteratorIterator seqs_end,
+                                     RandomAccessIterator3 target,
+                                     Comparator comp,
+                                     _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
 
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -1115,7 +1221,8 @@
 
     if (overhang != -1)
       {
-	difference_type unguarded_length = std::min(length, total_length - overhang);
+        difference_type unguarded_length =
+            std::min(length, total_length - overhang);
 	target_end = multiway_merge_loser_tree_unguarded
 	  <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
 	  (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
@@ -1145,23 +1252,31 @@
     return target_end;
   }
 
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
+                                     RandomAccessIteratorIterator seqs_end,
+                                      RandomAccessIterator3 target,
+                                      Comparator comp,
+                                      _DifferenceTp length, bool stable)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
     typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
-      RandomAccessIterator1;
 
     RandomAccessIterator3 target_end;
-    difference_type overhang = prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
+    difference_type overhang =
+        prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
 
     difference_type total_length = 0;
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
@@ -1172,7 +1287,8 @@
 	(*s).second++;
       }
 
-    difference_type unguarded_length = std::min(length, total_length - overhang);
+    difference_type unguarded_length =
+        std::min(length, total_length - overhang);
     target_end = multiway_merge_loser_tree_unguarded
       <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
       (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
@@ -1184,12 +1300,15 @@
 #endif
 
     // Copy rest stable.
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end && overhang > 0; s++)
+    for (RandomAccessIteratorIterator s = seqs_begin;
+         s != seqs_end && overhang > 0; s++)
       {
 	// Restore.
 	(*s).second--;
-	difference_type local_length = std::min((difference_type)overhang, (difference_type)LENGTH(*s));
-	target_end = std::copy((*s).first, (*s).first + local_length, target_end);
+        difference_type local_length =
+            std::min<difference_type>(overhang, LENGTH(*s));
+        target_end = std::copy((*s).first, (*s).first + local_length,
+                               target_end);
 	(*s).first += local_length;
 	overhang -= local_length;
       }
@@ -1203,7 +1322,7 @@
     return target_end;
   }
 
-  /** @brief Sequential multi-way merging switch.
+/** @brief Sequential multi-way merging switch.
    *
    *  The decision if based on the branching factor and runtime settings.
    *  @param seqs_begin Begin iterator of iterator pair input sequence.
@@ -1214,14 +1333,24 @@
    *  @param stable Stable merging incurs a performance penalty.
    *  @param sentinel The sequences have a sentinel element.
    *  @return End iterator of output sequence. */
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel, sequential_tag)
+  multiway_merge(RandomAccessIteratorIterator seqs_begin,
+                 RandomAccessIteratorIterator seqs_end,
+                 RandomAccessIterator3 target,
+                 Comparator comp, _DifferenceTp length,
+                 bool stable, bool sentinel,
+                 sequential_tag)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+      ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
@@ -1234,7 +1363,8 @@
     RandomAccessIterator3 return_target = target;
     int k = static_cast<int>(seqs_end - seqs_begin);
 
-    Settings::MultiwayMergeAlgorithm mwma = Settings::multiway_merge_algorithm;
+    Settings::MultiwayMergeAlgorithm mwma =
+        Settings::multiway_merge_algorithm;
 
     if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL)
       mwma = Settings::LOSER_TREE_COMBINED;
@@ -1244,23 +1374,40 @@
       case 0:
 	break;
       case 1:
-	return_target = std::copy(seqs_begin[0].first, seqs_begin[0].first + length, target);
+        return_target = std::copy(seqs_begin[0].first,
+                                  seqs_begin[0].first + length,
+                                  target);
 	seqs_begin[0].first += length;
 	break;
       case 2:
-	return_target = merge_advance(seqs_begin[0].first, seqs_begin[0].second, seqs_begin[1].first, seqs_begin[1].second, target, length, comp);
+        return_target = merge_advance(seqs_begin[0].first,
+                                      seqs_begin[0].second,
+                                      seqs_begin[1].first,
+                                      seqs_begin[1].second,
+                                      target, length, comp);
 	break;
       case 3:
 	switch (mwma)
 	  {
 	  case Settings::LOSER_TREE_COMBINED:
-	    return_target = multiway_merge_3_combined(seqs_begin, seqs_end, target, comp, length, stable);
+            return_target = multiway_merge_3_combined(seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
 	    break;
 	  case Settings::LOSER_TREE_SENTINEL:
-	    return_target = multiway_merge_3_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+            return_target = multiway_merge_3_variant<unguarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
 	    break;
 	  default:
-	    return_target = multiway_merge_3_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+            return_target = multiway_merge_3_variant<guarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
 	    break;
 	  }
 	break;
@@ -1268,13 +1415,25 @@
 	switch (mwma)
 	  {
 	  case Settings::LOSER_TREE_COMBINED:
-	    return_target = multiway_merge_4_combined(seqs_begin, seqs_end, target, comp, length, stable);
+            return_target = multiway_merge_4_combined(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
 	    break;
 	  case Settings::LOSER_TREE_SENTINEL:
-	    return_target = multiway_merge_4_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+            return_target = multiway_merge_4_variant<unguarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
 	    break;
 	  default:
-	    return_target = multiway_merge_4_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+            return_target = multiway_merge_4_variant<guarded_iterator>(
+                seqs_begin,
+                seqs_end,
+                target,
+                comp, length, stable);
 	    break;
 	  }
 	break;
@@ -1283,26 +1442,48 @@
 	  switch (mwma)
 	    {
 	    case Settings::BUBBLE:
-	      return_target = multiway_merge_bubble(seqs_begin, seqs_end, target, comp, length, stable);
+              return_target = multiway_merge_bubble(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
 	      break;
 #if _GLIBCXX_LOSER_TREE_EXPLICIT
 	    case Settings::LOSER_TREE_EXPLICIT:
-	      return_target = multiway_merge_loser_tree<LoserTreeExplicit<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
+              return_target = multiway_merge_loser_tree<
+                    LoserTreeExplicit<value_type, Comparator> >(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
 	      break;
 #endif
 #if _GLIBCXX_LOSER_TREE
 	    case Settings::LOSER_TREE:
-	      return_target = multiway_merge_loser_tree<LoserTree<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
+              return_target = multiway_merge_loser_tree<
+                    LoserTree<value_type, Comparator> >(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
 	      break;
 #endif
 #if _GLIBCXX_LOSER_TREE_COMBINED
 	    case Settings::LOSER_TREE_COMBINED:
-	      return_target = multiway_merge_loser_tree_combined(seqs_begin, seqs_end, target, comp, length, stable);
+              return_target = multiway_merge_loser_tree_combined(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
 	      break;
 #endif
 #if _GLIBCXX_LOSER_TREE_SENTINEL
 	    case Settings::LOSER_TREE_SENTINEL:
-	      return_target = multiway_merge_loser_tree_sentinel(seqs_begin, seqs_end, target, comp, length, stable);
+              return_target = multiway_merge_loser_tree_sentinel(
+                  seqs_begin,
+                  seqs_end,
+                  target,
+                  comp, length, stable);
 	      break;
 #endif
 	    default:
@@ -1319,7 +1500,7 @@
     return return_target;
   }
 
-  /** @brief Parallel multi-way merge routine.
+/** @brief Parallel multi-way merge routine.
    *
    *  The decision if based on the branching factor and runtime settings.
    *  @param seqs_begin Begin iterator of iterator pair input sequence.
@@ -1329,30 +1510,35 @@
    *  @param length Maximum length to merge.
    *  @param stable Stable merging incurs a performance penalty.
    *  @param sentinel Ignored.
-   *  @return End iterator of output sequence. 
+ *  @return End iterator of output sequence.
    */
-  template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel)
+  parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin,
+                          RandomAccessIteratorIterator seqs_end,
+                           RandomAccessIterator3 target,
+                           Comparator comp,
+                           _DifferenceTp length, bool stable, bool sentinel)
   {
     _GLIBCXX_CALL(length)
 
     typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+      typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+        ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
 
-#if _GLIBCXX_ASSERTIONS
-    for (RandomAccessIteratorIterator rii = seqs_begin; rii != seqs_end; rii++)
-      _GLIBCXX_PARALLEL_ASSERT(is_sorted((*rii).first, (*rii).second, comp));
-#endif
-
     // k sequences.
     int k = static_cast<int>(seqs_end - seqs_begin);
 
     difference_type total_length = 0;
-    for (RandomAccessIteratorIterator raii = seqs_begin; raii != seqs_end; raii++)
+      for (RandomAccessIteratorIterator raii = seqs_begin;
+           raii != seqs_end; raii++)
       total_length += LENGTH(*raii);
 
     _GLIBCXX_CALL(total_length)
@@ -1360,32 +1546,50 @@
     if (total_length == 0 || k == 0)
       return target;
 
-    thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length));
-
     bool tight = (total_length == length);
 
+      std::vector<std::pair<difference_type, difference_type> >* pieces;
+
+      thread_index_t num_threads = static_cast<thread_index_t>(
+          std::min<difference_type>(get_max_threads(), total_length));
+
+#     pragma omp parallel num_threads (num_threads)
+        {
+#         pragma omp single
+            {
+              num_threads = omp_get_num_threads();
     // Thread t will have to merge pieces[iam][0..k - 1]
-    std::vector<std::pair<difference_type, difference_type> >* pieces = new std::vector<std::pair<difference_type, difference_type> >[num_threads];
+              pieces = new std::vector<
+                  std::pair<difference_type, difference_type> >[num_threads];
     for (int s = 0; s < num_threads; s++)
       pieces[s].resize(k);
 
-    difference_type num_samples = Settings::merge_oversampling * num_threads;
+              difference_type num_samples =
+                  Settings::merge_oversampling * num_threads;
 
     if (Settings::multiway_merge_splitting == Settings::SAMPLING)
       {
-	value_type* samples = static_cast<value_type*>(::operator new(sizeof(value_type) * k * num_samples));
+                  value_type* samples = static_cast<value_type*>(
+                    ::operator new(sizeof(value_type) * k * num_samples));
 	// Sample.
 	for (int s = 0; s < k; s++)
 	  for (int i = 0; (difference_type)i < num_samples; i++)
 	    {
-	      difference_type sample_index = static_cast<difference_type>(LENGTH(seqs_begin[s]) * (double(i + 1) / (num_samples + 1)) * (double(length) / total_length));
-	      samples[s * num_samples + i] = seqs_begin[s].first[sample_index];
+                        difference_type sample_index =
+                            static_cast<difference_type>(
+                                LENGTH(seqs_begin[s]) * (double(i + 1) /
+                                (num_samples + 1)) * (double(length)
+                                / total_length));
+                        samples[s * num_samples + i] =
+                            seqs_begin[s].first[sample_index];
 	    }
 
 	if (stable)
-	  __gnu_sequential::stable_sort(samples, samples + (num_samples * k), comp);
+                    __gnu_sequential::stable_sort(
+                      samples, samples + (num_samples * k), comp);
 	else
-	  __gnu_sequential::sort(samples, samples + (num_samples * k), comp);
+                    __gnu_sequential::sort(
+                      samples, samples + (num_samples * k), comp);
 
 	for (int slab = 0; slab < num_threads; slab++)
 	  // For each slab / processor.
@@ -1393,42 +1597,59 @@
 	    {
 	      // For each sequence.
 	      if (slab > 0)
-		pieces[slab][seq].first = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * slab / num_threads], comp) - seqs_begin[seq].first;
+                          pieces[slab][seq].first =
+                              std::upper_bound(
+                                seqs_begin[seq].first,
+                                seqs_begin[seq].second,
+                                samples[num_samples * k * slab / num_threads],
+                                  comp)
+                              - seqs_begin[seq].first;
 	      else
 		{
 		  // Absolute beginning.
 		  pieces[slab][seq].first = 0;
 		}
 	      if ((slab + 1) < num_threads)
-		pieces[slab][seq].second = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * (slab + 1) / num_threads], comp) - seqs_begin[seq].first;
+                          pieces[slab][seq].second =
+                              std::upper_bound(
+                                  seqs_begin[seq].first,
+                                  seqs_begin[seq].second,
+                                  samples[num_samples * k * (slab + 1) /
+                                      num_threads], comp)
+                              - seqs_begin[seq].first;
 	      else
-		pieces[slab][seq].second = LENGTH(seqs_begin[seq]);	//absolute ending
+                        pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
 	    }
 	delete[] samples;
       }
     else
       {
 	// (Settings::multiway_merge_splitting == Settings::EXACT).
-	std::vector<RandomAccessIterator1>* offsets = new std::vector<RandomAccessIterator1>[num_threads];
-	std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > se(k);
+                  std::vector<RandomAccessIterator1>* offsets =
+                      new std::vector<RandomAccessIterator1>[num_threads];
+                  std::vector<
+                      std::pair<RandomAccessIterator1, RandomAccessIterator1>
+                      > se(k);
 
 	copy(seqs_begin, seqs_end, se.begin());
 
-	difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
+                  difference_type* borders =
+                      new difference_type[num_threads + 1];
 	equally_split(length, num_threads, borders);
 
 	for (int s = 0; s < (num_threads - 1); s++)
 	  {
 	    offsets[s].resize(k);
-	    multiseq_partition(se.begin(), se.end(), borders[s + 1],
+                      multiseq_partition(
+                          se.begin(), se.end(), borders[s + 1],
 			       offsets[s].begin(), comp);
 
 	    // Last one also needed and available.
 	    if (!tight)
 	      {
 		offsets[num_threads - 1].resize(k);
-		multiseq_partition(se.begin(), se.end(), 
-				   difference_type(length), 
+                          multiseq_partition(se.begin(), se.end(),
+                                difference_type(length),
 				   offsets[num_threads - 1].begin(),  comp);
 	      }
 	  }
@@ -1446,21 +1667,23 @@
 		    pieces[slab][seq].first = 0;
 		  }
 		else
-		  pieces[slab][seq].first = pieces[slab - 1][seq].second;
+                            pieces[slab][seq].first =
+                                pieces[slab - 1][seq].second;
 		if (!tight || slab < (num_threads - 1))
-		  pieces[slab][seq].second = offsets[slab][seq] - seqs_begin[seq].first;
+                            pieces[slab][seq].second =
+                                offsets[slab][seq] - seqs_begin[seq].first;
 		else
 		  {
 		    // slab == num_threads - 1
-		    pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
+                              pieces[slab][seq].second =
+                                  LENGTH(seqs_begin[seq]);
 		  }
 	      }
 	  }
 	delete[] offsets;
       }
+            } //single
 
-#	pragma omp parallel num_threads(num_threads)
-    {
       thread_index_t iam = omp_get_thread_num();
 
       difference_type target_position = 0;
@@ -1470,16 +1693,21 @@
 
       if (k > 2)
 	{
-	  std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks = new std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
+              std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks
+                = new
+                  std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
 
 	  difference_type local_length = 0;
 	  for (int s = 0; s < k; s++)
 	    {
-	      chunks[s] = std::make_pair(seqs_begin[s].first + pieces[iam][s].first, seqs_begin[s].first + pieces[iam][s].second);
+                  chunks[s] = std::make_pair(
+                      seqs_begin[s].first + pieces[iam][s].first,
+                      seqs_begin[s].first + pieces[iam][s].second);
 	      local_length += LENGTH(chunks[s]);
 	    }
 
-	  multiway_merge(chunks, chunks + k, target + target_position, comp,
+              multiway_merge(
+                    chunks, chunks + k, target + target_position, comp,
 			 std::min(local_length, length - target_position),
 			 stable, false, sequential_tag());
 
@@ -1487,16 +1715,19 @@
 	}
       else if (k == 2)
 	{
-	  RandomAccessIterator1 begin0 = seqs_begin[0].first + pieces[iam][0].first, begin1 = seqs_begin[1].first + pieces[iam][1].first;
+              RandomAccessIterator1
+                  begin0 = seqs_begin[0].first + pieces[iam][0].first,
+                  begin1 = seqs_begin[1].first + pieces[iam][1].first;
 	  merge_advance(begin0,
 			seqs_begin[0].first + pieces[iam][0].second,
 			begin1,
 			seqs_begin[1].first + pieces[iam][1].second,
 			target + target_position,
-			(pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first),
+                    (pieces[iam][0].second - pieces[iam][0].first) +
+                        (pieces[iam][1].second - pieces[iam][1].first),
 			comp);
 	}
-    }
+        } //parallel
 
 #if _GLIBCXX_ASSERTIONS
     _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
@@ -1511,7 +1742,7 @@
     return target + length;
   }
 
-  /** 
+/**
    *  @brief Multi-way merging front-end.
    *  @param seqs_begin Begin iterator of iterator pair input sequence.
    *  @param seqs_end End iterator of iterator pair input sequence.
@@ -1519,9 +1750,13 @@
    *  @param comp Comparator.
    *  @param length Maximum length to merge.
    *  @param stable Stable merging incurs a performance penalty.
-   *  @return End iterator of output sequence. 
+ *  @return End iterator of output sequence.
    */
-  template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorPairIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
   multiway_merge(RandomAccessIteratorPairIterator seqs_begin,
 		 RandomAccessIteratorPairIterator seqs_end,
@@ -1535,15 +1770,21 @@
       return target;
 
     RandomAccessIterator3 target_end;
-    if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
-      target_end = parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (difference_type)length, stable, false);
+    if (_GLIBCXX_PARALLEL_CONDITION(
+        ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
+        && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, static_cast<difference_type>(length), stable, false);
     else
-      target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, false, sequential_tag());
+      target_end = multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, length, stable, false, sequential_tag());
 
     return target_end;
   }
 
-  /** @brief Multi-way merging front-end.
+/** @brief Multi-way merging front-end.
    *  @param seqs_begin Begin iterator of iterator pair input sequence.
    *  @param seqs_end End iterator of iterator pair input sequence.
    *  @param target Begin iterator out output sequence.
@@ -1554,7 +1795,11 @@
    *  @pre For each @c i, @c seqs_begin[i].second must be the end
    *  marker of the sequence, but also reference the one more sentinel
    *  element. */
-  template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+    typename RandomAccessIteratorPairIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
   multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin,
 			  RandomAccessIteratorPairIterator seqs_end,
@@ -1570,10 +1815,16 @@
 
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
-    if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
-      return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (typename std::iterator_traits<RandomAccessIterator3>::difference_type)length, stable, true);
+    if (_GLIBCXX_PARALLEL_CONDITION(
+        ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
+        && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
+      return parallel_multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, static_cast<difference_type>(length), stable, true);
     else
-      return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, true, sequential_tag());
+      return multiway_merge(
+          seqs_begin, seqs_end,
+          target, comp, length, stable, true, sequential_tag());
   }
 }
 
Index: include/parallel/losertree.h
===================================================================
--- include/parallel/losertree.h	(revision 130225)
+++ include/parallel/losertree.h	(working copy)
@@ -29,9 +29,9 @@
 // Public License.
 
 /** @file parallel/losertree.h
- *  @brief Many generic loser tree variants.
- *  This file is a GNU parallel extension to the Standard C++ Library.
- */
+*  @brief Many generic loser tree variants.
+*  This file is a GNU parallel extension to the Standard C++ Library.
+*/
 
 // Written by Johannes Singler.
 
@@ -49,13 +49,13 @@
 
 #if _GLIBCXX_LOSER_TREE_EXPLICIT
 
-  /** @brief Guarded loser tree, copying the whole element into the
-   * tree structure.
-   *
-   *  Guarding is done explicitly through two flags per element, inf
-   *  and sup This is a quite slow variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, copying the whole element into the
+* tree structure.
+*
+*  Guarding is done explicitly through two flags per element, inf
+*  and sup This is a quite slow variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreeExplicit
   {
   private:
@@ -76,7 +76,9 @@
     Comparator comp;
 
   public:
-    inline LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       size = _size;
       offset = size;
@@ -93,9 +95,6 @@
     inline ~LoserTreeExplicit()
     { delete[] losers; }
 
-    inline void
-    print() { }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -106,7 +105,8 @@
       bool inf = false;
       for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
 	{
-	  if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && comp(losers[pos].key, key)) || losers[pos].inf || sup)
+          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
+               && comp(losers[pos].key, key)) || losers[pos].inf || sup)
 	    {
 	      // The other one is smaller.
 	      std::swap(losers[pos].key, key);
@@ -133,7 +133,7 @@
       for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
 	{
 	  // The smaller one gets promoted.
-	  if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup 
+          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
 	       && comp(losers[pos].key, key))
 	      || losers[pos].inf || sup)
 	    {
@@ -209,14 +209,14 @@
 
 #if _GLIBCXX_LOSER_TREE
 
-  /** @brief Guarded loser tree, either copying the whole element into
-   * the tree structure, or looking up the element via the index.
-   *
-   *  Guarding is done explicitly through one flag sup per element,
-   *  inf is not needed due to a better initialization routine.  This
-   *  is a well-performing variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, either copying the whole element into
+* the tree structure, or looking up the element via the index.
+*
+*  Guarding is done explicitly through one flag sup per element,
+*  inf is not needed due to a better initialization routine.  This
+*  is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTree
   {
   private:
@@ -240,7 +240,7 @@
       // Next greater power of 2.
       k = 1 << (log2(ik - 1) + 1);
       offset = k;
-      losers = new Loser[k * 2];
+      losers = static_cast<Loser*>(::operator new(k * 2 * sizeof(Loser)));
       for (unsigned int i = ik - 1; i < k; i++)
 	losers[i + k].sup = true;
     }
@@ -248,14 +248,6 @@
     inline ~LoserTree()
     { delete[] losers; }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < (k * 2); i++)
-	printf("%d    %d from %d,  %d\n", i, losers[i].key, 
-	       losers[i].source, losers[i].sup);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -267,7 +259,7 @@
 
       losers[pos].sup = sup;
       losers[pos].source = source;
-      losers[pos].key = key;
+      new(&(losers[pos].key)) T(key);
     }
 
     unsigned int
@@ -282,7 +274,8 @@
 	  unsigned int left = init_winner (2 * root);
 	  unsigned int right = init_winner (2 * root + 1);
 	  if (losers[right].sup ||
-	      (!losers[left].sup && !comp(losers[right].key, losers[left].key)))
+              (!losers[left].sup
+                && !comp(losers[right].key, losers[left].key)))
 	    {
 	      // Left one is less or equal.
 	      losers[root] = losers[right];
@@ -337,8 +330,9 @@
 	{
 	  unsigned int left = init_winner (2 * root);
 	  unsigned int right = init_winner (2 * root + 1);
-	  if (	losers[right].sup ||
-		(!losers[left].sup && !comp(losers[right].key, losers[left].key)))
+          if (losers[right].sup
+              || (!losers[left].sup
+                && !comp(losers[right].key, losers[left].key)))
 	    {
 	      // Left one is less or equal.
 	      losers[root] = losers[right];
@@ -365,10 +359,11 @@
       for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
 	{
 	  // The smaller one gets promoted, ties are broken by source.
-	  if (	(sup && (!losers[pos].sup || losers[pos].source < source)) ||
-		(!sup && !losers[pos].sup &&
-		 ((comp(losers[pos].key, key)) ||
-		  (!comp(key, losers[pos].key) && losers[pos].source < source))))
+          if (	(sup && (!losers[pos].sup || losers[pos].source < source))
+                || (!sup && !losers[pos].sup
+                  && ((comp(losers[pos].key, key))
+                    || (!comp(key, losers[pos].key)
+                      && losers[pos].source < source))))
 	    {
 	      // The other one is smaller.
 	      std::swap(losers[pos].sup, sup);
@@ -387,14 +382,14 @@
 
 #if _GLIBCXX_LOSER_TREE_REFERENCE
 
-  /** @brief Guarded loser tree, either copying the whole element into
-   * the tree structure, or looking up the element via the index.
-   *
-   *  Guarding is done explicitly through one flag sup per element,
-   *  inf is not needed due to a better initialization routine.  This
-   *  is a well-performing variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, either copying the whole element into
+* the tree structure, or looking up the element via the index.
+*
+*  Guarding is done explicitly through one flag sup per element,
+*  inf is not needed due to a better initialization routine.  This
+*  is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreeReference
   {
 #undef COPY
@@ -423,7 +418,9 @@
     Comparator comp;
 
   public:
-    inline LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
 
@@ -446,13 +443,6 @@
 #endif
     }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < (k * 2); i++)
-	printf("%d    %d from %d,  %d\n", i, KEY(i), losers[i].source, losers[i].sup);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -570,7 +560,8 @@
 	  if (	(sup && (!losers[pos].sup || losers[pos].source < source)) ||
 		(!sup && !losers[pos].sup &&
 		 ((comp(KEY(pos), KEY_SOURCE(source))) ||
-		  (!comp(KEY_SOURCE(source), KEY(pos)) && losers[pos].source < source))))
+                  (!comp(KEY_SOURCE(source), KEY(pos))
+                    && losers[pos].source < source))))
 	    {
 	      // The other one is smaller.
 	      std::swap(losers[pos].sup, sup);
@@ -595,13 +586,13 @@
 
 #if _GLIBCXX_LOSER_TREE_POINTER
 
-  /** @brief Guarded loser tree, either copying the whole element into
+/** @brief Guarded loser tree, either copying the whole element into
       the tree structure, or looking up the element via the index.
-   *  Guarding is done explicitly through one flag sup per element,
-   *  inf is not needed due to a better initialization routine.
-   *  This is a well-performing variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+*  Guarding is done explicitly through one flag sup per element,
+*  inf is not needed due to a better initialization routine.
+*  This is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreePointer
   {
   private:
@@ -617,7 +608,9 @@
     Comparator comp;
 
   public:
-    inline LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
 
@@ -632,13 +625,6 @@
     inline ~LoserTreePointer()
     { delete[] losers; }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < (k * 2); i++)
-	printf("%d    %d from %d,  %d\n", i, losers[i].keyp, losers[i].source, losers[i].sup);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -664,8 +650,9 @@
 	{
 	  unsigned int left = init_winner (2 * root);
 	  unsigned int right = init_winner (2 * root + 1);
-	  if (	losers[right].sup ||
-		(!losers[left].sup && !comp(*losers[right].keyp, *losers[left].keyp)))
+          if (losers[right].sup
+                || (!losers[left].sup
+                  && !comp(*losers[right].keyp, *losers[left].keyp)))
 	    {
 	      // Left one is less or equal.
 	      losers[root] = losers[right];
@@ -684,7 +671,7 @@
     init()
     { losers[0] = losers[init_winner(1)]; }
 
-    inline void 
+    inline void
     delete_min_insert(const T& key, bool sup)
     {
       const T* keyp = &key;
@@ -722,7 +709,7 @@
 	  unsigned int left = init_winner (2 * root);
 	  unsigned int right = init_winner (2 * root + 1);
 	  if (losers[right].sup
-	      || (!losers[left].sup && !comp(*losers[right].keyp, 
+              || (!losers[left].sup && !comp(*losers[right].keyp,
 					     *losers[left].keyp)))
 	    {
 	      // Left one is less or equal.
@@ -753,7 +740,7 @@
 	  if (	(sup && (!losers[pos].sup || losers[pos].source < source)) ||
 		(!sup && !losers[pos].sup &&
 		 ((comp(*losers[pos].keyp, *keyp)) ||
-		  (!comp(*keyp, *losers[pos].keyp) 
+                  (!comp(*keyp, *losers[pos].keyp)
 		   && losers[pos].source < source))))
 	    {
 	      // The other one is smaller.
@@ -773,13 +760,13 @@
 
 #if _GLIBCXX_LOSER_TREE_UNGUARDED
 
-  /** @brief Unguarded loser tree, copying the whole element into the
-   * tree structure.
-   *
-   *  No guarding is done, therefore not a single input sequence must
-   *  run empty.  This is a very fast variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Unguarded loser tree, copying the whole element into the
+* tree structure.
+*
+*  No guarding is done, therefore not a single input sequence must
+*  run empty.  This is a very fast variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreeUnguarded
   {
   private:
@@ -809,7 +796,9 @@
     }
 
   public:
-    inline LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
       // Next greater or equal power of 2.
@@ -826,13 +815,6 @@
       delete[] mapping;
     }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < k + ik; i++)
-	printf("%d    %d from %d\n", i, losers[i].key, losers[i].source);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -855,7 +837,8 @@
 	  // Next greater or equal power of 2.
 	  unsigned int division = 1 << (log2(end - begin - 1));
 	  unsigned int left = init_winner(2 * root, begin, begin + division);
-	  unsigned int right = init_winner(2 * root + 1, begin + division, end);
+          unsigned int right =
+                          init_winner(2 * root + 1, begin + division, end);
 	  if (!comp(losers[right].key, losers[left].key))
 	    {
 	      // Left one is less or equal.
@@ -912,7 +895,8 @@
 	{
 	  // The smaller one gets promoted, ties are broken by source.
 	  if (comp(losers[pos].key, keyr)
-	      || (!comp(keyr, losers[pos].key) && losers[pos].source < source))
+              || (!comp(keyr, losers[pos].key)
+                && losers[pos].source < source))
 	    {
 	      // The other one is smaller.
 	      std::swap(losers[pos].source, source);
@@ -926,13 +910,13 @@
 
 #if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
 
-  /** @brief Unguarded loser tree, keeping only pointers to the
-   * elements in the tree structure.
-   *
-   *  No guarding is done, therefore not a single input sequence must
-   *  run empty.  This is a very fast variant.
-   */
-  template<typename T, typename Comparator = std::less<T> >
+/** @brief Unguarded loser tree, keeping only pointers to the
+* elements in the tree structure.
+*
+*  No guarding is done, therefore not a single input sequence must
+*  run empty.  This is a very fast variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
   class LoserTreePointerUnguarded
   {
   private:
@@ -961,7 +945,10 @@
     }
 
   public:
-    inline LoserTreePointerUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+    inline
+    LoserTreePointerUnguarded(unsigned int _k,
+                              Comparator _comp = std::less<T>())
+      : comp(_comp)
     {
       ik = _k;
 
@@ -979,13 +966,6 @@
       delete[] mapping;
     }
 
-    void
-    print()
-    {
-      for (unsigned int i = 0; i < k + ik; i++)
-	printf("%d    %d from %d\n", i, *losers[i].keyp, losers[i].source);
-    }
-
     inline int
     get_min_source()
     { return losers[0].source; }
@@ -1008,7 +988,8 @@
 	  // Next greater or equal power of 2.
 	  unsigned int division = 1 << (log2(end - begin - 1));
 	  unsigned int left = init_winner(2 * root, begin, begin + division);
-	  unsigned int right = init_winner(2 * root + 1, begin + division, end);
+          unsigned int right
+                          = init_winner(2 * root + 1, begin + division, end);
 	  if (!comp(*losers[right].keyp, *losers[left].keyp))
 	    {
 	      // Left one is less or equal.
@@ -1066,7 +1047,7 @@
 	{
 	  // The smaller one gets promoted, ties are broken by source.
 	  if (comp(*losers[pos].keyp, *keyp)
-	      || (!comp(*keyp, *losers[pos].keyp) 
+              || (!comp(*keyp, *losers[pos].keyp)
 		  && losers[pos].source < source))
 	    {
 	      // The other one is smaller.
@@ -1079,7 +1060,7 @@
   };
 #endif
 
-  template<typename _ValueTp, class Comparator>
+template<typename _ValueTp, class Comparator>
   struct loser_tree_traits
   {
 #if _GLIBCXX_LOSER_TREE
@@ -1093,7 +1074,7 @@
 #endif
   };
 
-  template<typename _ValueTp, class Comparator>
+template<typename _ValueTp, class Comparator>
   struct loser_tree_unguarded_traits
   {
 #if _GLIBCXX_LOSER_TREE_UNGUARDED
Index: include/parallel/workstealing.h
===================================================================
--- include/parallel/workstealing.h	(revision 130225)
+++ include/parallel/workstealing.h	(working copy)
@@ -55,8 +55,8 @@
 
 #define _GLIBCXX_JOB_VOLATILE volatile
 
-  /** @brief One job for a certain thread. */
-  template<typename _DifferenceTp>
+/** @brief One job for a certain thread. */
+template<typename _DifferenceTp>
   struct Job
   {
     typedef _DifferenceTp difference_type;
@@ -78,7 +78,7 @@
     _GLIBCXX_JOB_VOLATILE difference_type load;
   };
 
-  /** @brief Work stealing algorithm for random access iterators.
+/** @brief Work stealing algorithm for random access iterators.
    *
    *  Uses O(1) additional memory. Synchronization at job lists is
    *  done with atomic operations.
@@ -96,13 +96,20 @@
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+template<
+    typename RandomAccessIterator,
+    typename Op,
+    typename Fu,
+    typename Red,
+    typename Result>
   Op
-  for_each_template_random_access_workstealing(RandomAccessIterator begin,
+  for_each_template_random_access_workstealing(
+      RandomAccessIterator begin,
 					       RandomAccessIterator end,
 					       Op op, Fu& f, Red r,
 					       Result base, Result& output,
-					       typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+      typename std::iterator_traits<RandomAccessIterator>::difference_type
+          bound)
   {
     _GLIBCXX_CALL(end - begin)
 
@@ -110,34 +117,43 @@
     typedef typename traits_type::difference_type difference_type;
 
 
-    difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
+    difference_type chunk_size =
+        static_cast<difference_type>(Settings::workstealing_chunk_size);
 
     // How many jobs?
     difference_type length = (bound < 0) ? (end - begin) : bound;
 
     // To avoid false sharing in a cache line.
-    const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
+    const int stride =
+        Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
 
     // Total number of threads currently working.
     thread_index_t busy = 0;
-    thread_index_t num_threads = get_max_threads();
-    difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
 
+    Job<difference_type> *job;
+
     omp_lock_t output_lock;
     omp_init_lock(&output_lock);
 
+    // Write base value to output.
+    output = base;
+
     // No more threads than jobs, at least one thread.
-    difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
-    num_threads = static_cast<thread_index_t>(num_threads_max);
+    thread_index_t num_threads =
+        __gnu_parallel::max<thread_index_t>(1,
+            __gnu_parallel::min<difference_type>(length, get_max_threads()));
 
+#   pragma omp parallel shared(busy) num_threads(num_threads)
+      {
+
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+
     // Create job description array.
-    Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
+            job = new Job<difference_type>[num_threads * stride];
+          }
 
-    // Write base value to output.
-    output = base;
-
-#pragma omp parallel shared(busy) num_threads(num_threads)
-    {
       // Initialization phase.
 
       // Flags for every thread if it is doing productive work.
@@ -158,19 +174,22 @@
       // Number of elements to steal in one attempt.
       difference_type steal;
 
-      // Every thread has its own random number generator (modulo num_threads).
+        // Every thread has its own random number generator
+        // (modulo num_threads).
       random_number rand_gen(iam, num_threads);
 
-#pragma omp atomic
       // This thread is currently working.
+#       pragma omp atomic
       busy++;
 
       iam_working = true;
 
       // How many jobs per thread? last thread gets the rest.
-      my_job.first = static_cast<difference_type>(iam * (length / num_threads));
+        my_job.first =
+            static_cast<difference_type>(iam * (length / num_threads));
 
-      my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
+        my_job.last = (iam == (num_threads - 1)) ?
+            (length - 1) : ((iam + 1) * (length / num_threads) - 1);
       my_job.load = my_job.last - my_job.first + 1;
 
       // Init result with first value (to have a base value for reduction).
@@ -185,26 +204,29 @@
 
       RandomAccessIterator current;
 
-#pragma omp barrier
+#       pragma omp barrier
 
       // Actual work phase
       // Work on own or stolen start
       while (busy > 0)
 	{
 	  // Work until no productive thread left.
-#pragma omp flush(busy)
+#           pragma omp flush(busy)
 
 	  // Thread has own work to do
 	  while (my_job.first <= my_job.last)
 	    {
 	      // fetch-and-add call
 	      // Reserve current job block (size chunk_size) in my queue.
-	      difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
+                difference_type current_job =
+                  fetch_and_add<difference_type>(&(my_job.first), chunk_size);
 
 	      // Update load, to make the three values consistent,
 	      // first might have been changed in the meantime
 	      my_job.load = my_job.last - my_job.first + 1;
-	      for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
+                for (difference_type job_counter = 0;
+                     job_counter < chunk_size && current_job <= my_job.last;
+                     job_counter++)
 		{
 		  // Yes: process it!
 		  current = begin + current_job;
@@ -214,15 +236,14 @@
 		  result = r(result, f(op, current));
 		}
 
-#pragma omp flush(busy)
-
+#               pragma omp flush(busy)
 	    }
 
 	  // After reaching this point, a thread's job list is empty.
 	  if (iam_working)
 	    {
-#pragma omp atomic
 	      // This thread no longer has work.
+#               pragma omp atomic
 	      busy--;
 
 	      iam_working = false;
@@ -231,16 +252,17 @@
 	  difference_type supposed_first, supposed_last, supposed_load;
 	  do
 	    {
-	      // Find random nonempty deque (not own) and do consistency check.
+                // Find random nonempty deque (not own), do consistency check.
 	      yield();
-#pragma omp flush(busy)
+#               pragma omp flush(busy)
 	      victim = rand_gen();
 	      supposed_first = job[victim * stride].first;
 	      supposed_last = job[victim * stride].last;
 	      supposed_load = job[victim * stride].load;
 	    }
 	  while (busy > 0
-		 && ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
+              && ((supposed_load <= 0)
+                || ((supposed_first + supposed_load - 1) != supposed_last)));
 
 	  if (busy == 0)
 	    break;
@@ -251,40 +273,30 @@
 	      // Number of elements to steal (at least one).
 	      steal = (supposed_load < 2) ? 1 : supposed_load / 2;
 
-	      // Protects against stealing threads
-	      // omp_set_lock(&(job[victim * stride].lock));
-
 	      // Push victim's start forward.
-	      difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
-	      difference_type stolen_try = stolen_first + steal - difference_type(1);
+                difference_type stolen_first =
+                    fetch_and_add<difference_type>(
+                        &(job[victim * stride].first), steal);
+                difference_type stolen_try =
+                    stolen_first + steal - difference_type(1);
 
-	      // Protects against working thread
-	      // omp_unset_lock(&(job[victim * stride].lock));
-
 	      my_job.first = stolen_first;
-	      
-	      // Avoid std::min dependencies.
-	      my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
-
+                my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
 	      my_job.load = my_job.last - my_job.first + 1;
 
-	      //omp_unset_lock(&(my_job.lock));
-
-#pragma omp atomic
 	      // Has potential work again.
+#               pragma omp atomic
 	      busy++;
 	      iam_working = true;
 
-#pragma omp flush(busy)
+#               pragma omp flush(busy)
 	    }
-#pragma omp flush(busy)
+#           pragma omp flush(busy)
 	} // end while busy > 0
       // Add accumulated result to output.
       omp_set_lock(&output_lock);
       output = r(output, result);
       omp_unset_lock(&output_lock);
-
-      //omp_destroy_lock(&(my_job.lock));
     }
 
     delete[] job;
Index: include/parallel/base.h
===================================================================
--- include/parallel/base.h	(revision 130225)
+++ include/parallel/base.h	(working copy)
@@ -49,12 +49,12 @@
   // XXX remove std::duplicates from here if possible,
   // XXX but keep minimal dependencies.
 
-  /** @brief Calculates the rounded-down logarithm of @c n for base 2.
+/** @brief Calculates the rounded-down logarithm of @c n for base 2.
    *  @param n Argument.
    *  @return Returns 0 for argument 0.
    */
-  template<typename Size> 
-    inline Size 
+template<typename Size>
+  inline Size
     log2(Size n)
     {
       Size k;
@@ -63,21 +63,21 @@
       return k;
     }
 
-  /** @brief Encode two integers into one __gnu_parallel::lcas_t.
+/** @brief Encode two integers into one __gnu_parallel::lcas_t.
    *  @param a First integer, to be encoded in the most-significant @c
    *  lcas_t_bits/2 bits.
    *  @param b Second integer, to be encoded in the least-significant
    *  @c lcas_t_bits/2 bits.
    *  @return __gnu_parallel::lcas_t value encoding @c a and @c b.
-   *  @see decode2 
+  *  @see decode2
    */
-  inline lcas_t
-  encode2(int a, int b)	//must all be non-negative, actually
-  {
+inline lcas_t
+encode2(int a, int b)	//must all be non-negative, actually
+{
     return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
-  }
+}
 
-  /** @brief Decode two integers from one __gnu_parallel::lcas_t.
+/** @brief Decode two integers from one __gnu_parallel::lcas_t.
    *  @param x __gnu_parallel::lcas_t to decode integers from.
    *  @param a First integer, to be decoded from the most-significant
    *  @c lcas_t_bits/2 bits of @c x.
@@ -85,18 +85,34 @@
    *  @c lcas_t_bits/2 bits of @c x.
    *  @see encode2
    */
-  inline void
-  decode2(lcas_t x, int& a, int& b)
-  {
+inline void
+decode2(lcas_t x, int& a, int& b)
+{
     a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
     b = (int)((x >>               0 ) & lcas_t_mask);
-  }
+}
 
-  /** @brief Constructs predicate for equality from strict weak
+/** @brief Equivalent to std::min. */
+template<typename T>
+  const T&
+  min(const T& a, const T& b)
+  {
+    return (a < b) ? a : b;
+  };
+
+/** @brief Equivalent to std::max. */
+template<typename T>
+  const T&
+  max(const T& a, const T& b)
+  {
+    return (a > b) ? a : b;
+  };
+
+/** @brief Constructs predicate for equality from strict weak
    *  ordering predicate
    */
-  // XXX comparator at the end, as per others
-  template<typename Comparator, typename T1, typename T2>
+// XXX comparator at the end, as per others
+template<typename Comparator, typename T1, typename T2>
   class equal_from_less : public std::binary_function<T1, T2, bool>
   {
   private:
@@ -112,8 +128,9 @@
   };
 
 
-  /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
-  template<typename _Predicate, typename argument_type>
+/** @brief Similar to std::binder1st,
+  *  but giving the argument types explicitly. */
+template<typename _Predicate, typename argument_type>
     class unary_negate
     : public std::unary_function<argument_type, bool>
     {
@@ -129,8 +146,13 @@
       { return !_M_pred(__x); }
     };
 
-  /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
-  template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
+/** @brief Similar to std::binder1st,
+  *  but giving the argument types explicitly. */
+template<
+    typename _Operation,
+    typename first_argument_type,
+    typename second_argument_type,
+    typename result_type>
     class binder1st
     : public std::unary_function<second_argument_type, result_type>
     {
@@ -154,11 +176,15 @@
       { return op(value, __x); }
     };
 
-  /** 
+/**
    *  @brief Similar to std::binder2nd, but giving the argument types
-   *  explicitly. 
+  *  explicitly.
    */
-  template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
+template<
+    typename _Operation,
+    typename first_argument_type,
+    typename second_argument_type,
+    typename result_type>
     class binder2nd
     : public std::unary_function<first_argument_type, result_type>
     {
@@ -182,30 +208,30 @@
       { return op(__x, value); }
     };
 
-  /** @brief Similar to std::equal_to, but allows two different types. */
-  template<typename T1, typename T2>
+/** @brief Similar to std::equal_to, but allows two different types. */
+template<typename T1, typename T2>
   struct equal_to : std::binary_function<T1, T2, bool>
   {
     bool operator()(const T1& t1, const T2& t2) const
     { return t1 == t2; }
   };
 
-  /** @brief Similar to std::less, but allows two different types. */
-  template<typename T1, typename T2>
+/** @brief Similar to std::less, but allows two different types. */
+template<typename T1, typename T2>
   struct less : std::binary_function<T1, T2, bool>
   {
-    bool 
+    bool
     operator()(const T1& t1, const T2& t2) const
     { return t1 < t2; }
 
-    bool 
+    bool
     operator()(const T2& t2, const T1& t1) const
     { return t2 < t1; }
   };
 
-  // Partial specialization for one type. Same as std::less.
-  template<typename _Tp>
-  struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
+// Partial specialization for one type. Same as std::less.
+template<typename _Tp>
+struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
     {
       bool
       operator()(const _Tp& __x, const _Tp& __y) const
@@ -214,21 +240,23 @@
 
 
     /** @brief Similar to std::plus, but allows two different types. */
-  template<typename _Tp1, typename _Tp2>
+template<typename _Tp1, typename _Tp2>
     struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
     {
-      typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result;
+    typedef typeof(*static_cast<_Tp1*>(NULL)
+                    + *static_cast<_Tp2*>(NULL)) result;
 
       result
       operator()(const _Tp1& __x, const _Tp2& __y) const
       { return __x + __y; }
     };
 
-  // Partial specialization for one type. Same as std::plus.
-  template<typename _Tp>
+// Partial specialization for one type. Same as std::plus.
+template<typename _Tp>
     struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
     {
-      typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result;
+    typedef typeof(*static_cast<_Tp*>(NULL)
+                    + *static_cast<_Tp*>(NULL)) result;
 
       result
       operator()(const _Tp& __x, const _Tp& __y) const
@@ -236,22 +264,24 @@
     };
 
 
-  /** @brief Similar to std::multiplies, but allows two different types. */
-  template<typename _Tp1, typename _Tp2>
+/** @brief Similar to std::multiplies, but allows two different types. */
+template<typename _Tp1, typename _Tp2>
     struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
     {
-      typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result;
+    typedef typeof(*static_cast<_Tp1*>(NULL)
+                    * *static_cast<_Tp2*>(NULL)) result;
 
       result
       operator()(const _Tp1& __x, const _Tp2& __y) const
       { return __x * __y; }
     };
 
-  // Partial specialization for one type. Same as std::multiplies.
-  template<typename _Tp>
+// Partial specialization for one type. Same as std::multiplies.
+template<typename _Tp>
     struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
     {
-      typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result;
+    typedef typeof(*static_cast<_Tp*>(NULL)
+                    * *static_cast<_Tp*>(NULL)) result;
 
       result
       operator()(const _Tp& __x, const _Tp& __y) const
@@ -259,15 +289,15 @@
     };
 
 
-  template<typename T, typename _DifferenceTp>
+template<typename T, typename _DifferenceTp>
   class pseudo_sequence;
 
-  /** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
+/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
    *  If features the usual random-access iterator functionality.
    *  @param T Sequence value type.
-   *  @param difference_type Sequence difference type. 
+  *  @param difference_type Sequence difference type.
    */
-  template<typename T, typename _DifferenceTp>
+template<typename T, typename _DifferenceTp>
   class pseudo_sequence_iterator
   {
   public:
@@ -296,34 +326,34 @@
     operator++(int)
     { return type(pos++); }
 
-    const T& 
+    const T&
     operator*() const
     { return val; }
 
-    const T& 
+    const T&
     operator[](difference_type) const
     { return val; }
 
-    bool 
+    bool
     operator==(const type& i2)
     { return pos == i2.pos; }
 
-    difference_type 
+    difference_type
     operator!=(const type& i2)
     { return pos != i2.pos; }
 
-    difference_type 
+    difference_type
     operator-(const type& i2)
     { return pos - i2.pos; }
   };
 
-  /** @brief Sequence that conceptually consists of multiple copies of
+/** @brief Sequence that conceptually consists of multiple copies of
       the same element.
    *  The copies are not stored explicitly, of course.
    *  @param T Sequence value type.
-   *  @param difference_type Sequence difference type. 
+  *  @param difference_type Sequence difference type.
    */
-  template<typename T, typename _DifferenceTp>
+template<typename T, typename _DifferenceTp>
   class pseudo_sequence
   {
     typedef pseudo_sequence<T, _DifferenceTp> type;
@@ -338,7 +368,7 @@
      *  @param val Element of the sequence.
      *  @param count Number of (virtual) copies.
      */
-    pseudo_sequence(const T& val, difference_type count) 
+    pseudo_sequence(const T& val, difference_type count)
     : val(val), count(count)  { }
 
     /** @brief Begin iterator. */
@@ -356,24 +386,24 @@
     difference_type count;
   };
 
-  /** @brief Functor that does nothing */
-  template<typename _ValueTp>
+/** @brief Functor that does nothing */
+template<typename _ValueTp>
   class void_functor
   {
-    inline void 
+    inline void
     operator()(const _ValueTp& v) const { }
   };
 
-  /** @brief Compute the median of three referenced elements,
+/** @brief Compute the median of three referenced elements,
       according to @c comp.
    *  @param a First iterator.
    *  @param b Second iterator.
    *  @param c Third iterator.
-   *  @param comp Comparator. 
+  *  @param comp Comparator.
    */
-  template<typename RandomAccessIterator, typename Comparator>
-  RandomAccessIterator
-  median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b, 
+template<typename RandomAccessIterator, typename Comparator>
+RandomAccessIterator
+  median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
 			    RandomAccessIterator c, Comparator& comp)
   {
     if (comp(*a, *b))
@@ -397,26 +427,25 @@
       }
   }
 
-  // Avoid the use of assert, because we're trying to keep the <cassert>
-  // include out of the mix. (Same as debug mode).
-  inline void
-  __replacement_assert(const char* __file, int __line, 
+// Avoid the use of assert, because we're trying to keep the <cassert>
+// include out of the mix. (Same as debug mode).
+inline void
+__replacement_assert(const char* __file, int __line,
 		       const char* __function, const char* __condition)
-  {
+{
     std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
 		__function, __condition);
     __builtin_abort();
-  }
-  
+}
+
 #define _GLIBCXX_PARALLEL_ASSERT(_Condition)                            \
-  do 								        \
+do 								        \
     {									\
       if (!(_Condition))						\
 	__gnu_parallel::__replacement_assert(__FILE__, __LINE__,	\
 				    __PRETTY_FUNCTION__, #_Condition);	\
     } while (false)
-  
+
 } //namespace __gnu_parallel
 
 #endif
-
Index: include/parallel/par_loop.h
===================================================================
--- include/parallel/par_loop.h	(revision 130225)
+++ include/parallel/par_loop.h	(working copy)
@@ -41,11 +41,12 @@
 
 #include <omp.h>
 #include <parallel/settings.h>
+#include <parallel/base.h>
 
 namespace __gnu_parallel
 {
 
-  /** @brief Embarrassingly parallel algorithm for random access
+/** @brief Embarrassingly parallel algorithm for random access
    * iterators, using hand-crafted parallelization by equal splitting
    * the work.
    *
@@ -63,47 +64,57 @@
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+template<
+    typename RandomAccessIterator,
+    typename Op,
+    typename Fu,
+    typename Red,
+    typename Result>
   Op
-  for_each_template_random_access_ed(RandomAccessIterator begin,
-				     RandomAccessIterator end, Op o, Fu& f,
-				     Red r, Result base, Result& output,
-				     typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+  for_each_template_random_access_ed(
+              RandomAccessIterator begin,
+              RandomAccessIterator end,
+              Op o, Fu& f, Red r, Result base, Result& output,
+              typename std::iterator_traits<RandomAccessIterator>::
+                  difference_type bound)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::difference_type difference_type;
 
     const difference_type length = end - begin;
-    const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
-    const difference_type dmin = settings_threads < length ? settings_threads : length;
-    const difference_type dmax = dmin > 1 ? dmin : 1;
+    Result *thread_results;
 
-    thread_index_t num_threads = static_cast<thread_index_t>(dmax);
+    thread_index_t num_threads =
+        __gnu_parallel::min<difference_type>(get_max_threads(), length);
 
+#   pragma omp parallel num_threads(num_threads)
+      {
+#       pragma omp single
+          {
+            num_threads = omp_get_num_threads();
+            thread_results = new Result[num_threads];
+          }
 
-    Result *thread_results = new Result[num_threads];
+        thread_index_t iam = omp_get_thread_num();
 
-#pragma omp parallel num_threads(num_threads)
-    {
       // Neutral element.
       Result reduct = Result();
 
-      thread_index_t p = num_threads;
-      thread_index_t iam = omp_get_thread_num();
-      difference_type start = iam * length / p;
-      difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
+        difference_type
+            start = equally_split_point(length, num_threads, iam),
+            stop = equally_split_point(length, num_threads, iam + 1);
 
-      if (start < limit)
+        if (start < stop)
 	{
 	  reduct = f(o, begin + start);
-	  start++;
+            ++start;
 	}
 
-      for (; start < limit; start++)
+        for (; start < stop; ++start)
 	reduct = r(reduct, f(o, begin + start));
 
       thread_results[iam] = reduct;
-    }
+      } //parallel
 
     for (thread_index_t i = 0; i < num_threads; i++)
       output = r(output, thread_results[i]);
Index: include/parallel/features.h
===================================================================
--- include/parallel/features.h	(revision 130225)
+++ include/parallel/features.h	(working copy)
@@ -66,7 +66,7 @@
  *  @brief Include guarded (sequences may run empty) loser tree,
  *  moving objects.
  *  @see __gnu_parallel::Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE 0
+#define _GLIBCXX_LOSER_TREE 1
 #endif
 
 #ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
Index: include/parallel/quicksort.h
===================================================================
--- include/parallel/quicksort.h	(revision 130225)
+++ include/parallel/quicksort.h	(working copy)
@@ -53,11 +53,17 @@
    *  this part.
    */
   template<typename RandomAccessIterator, typename Comparator>
-  inline typename std::iterator_traits<RandomAccessIterator>::difference_type
-  parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
+  inline
+  typename std::iterator_traits<RandomAccessIterator>::difference_type
+  parallel_sort_qs_divide(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
 			  Comparator comp,
-			  typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
-			  typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
+      typename std::iterator_traits<RandomAccessIterator>::difference_type
+          pivot_rank,
+      typename std::iterator_traits<RandomAccessIterator>::difference_type
+          num_samples,
+      thread_index_t num_threads)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -65,20 +71,24 @@
 
     difference_type n = end - begin;
     num_samples = std::min(num_samples, n);
-    value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
 
+    // Allocate uninitialized, to avoid default constructor.
+    value_type* samples = static_cast<value_type*>(
+        operator new(num_samples * sizeof(value_type)));
+
     for (difference_type s = 0; s < num_samples; s++)
       {
-	const unsigned long long index = static_cast<unsigned long long>(s) 
+        const unsigned long long index = static_cast<unsigned long long>(s)
 	  				 * n / num_samples;
-	samples[s] = begin[index];
+        new(samples + s) value_type(begin[index]);
       }
 
     __gnu_sequential::sort(samples, samples + num_samples, comp);
 
     value_type& pivot = samples[pivot_rank * num_samples / n];
 
-    __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
+    __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
+        pred(comp, pivot);
     difference_type split = parallel_partition(begin, end, pred, num_threads);
 
     return split;
@@ -93,7 +103,10 @@
    */
   template<typename RandomAccessIterator, typename Comparator>
   inline void
-  parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
+  parallel_sort_qs_conquer(RandomAccessIterator begin,
+                           RandomAccessIterator end,
+                           Comparator comp,
+                           thread_index_t num_threads)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -110,24 +123,27 @@
     if (n <= 1)
       return;
 
-    thread_index_t num_processors_left;
+    thread_index_t num_threads_left;
 
     if ((num_threads % 2) == 1)
-      num_processors_left = num_threads / 2 + 1;
+      num_threads_left = num_threads / 2 + 1;
     else
-      num_processors_left = num_threads / 2;
+      num_threads_left = num_threads / 2;
 
-    pivot_rank = n * num_processors_left / num_threads;
+    pivot_rank = n * num_threads_left / num_threads;
 
-    difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
-Settings::sort_qs_num_samples_preset, num_threads);
+    difference_type split = parallel_sort_qs_divide(
+        begin, end, comp, pivot_rank,
+        Settings::sort_qs_num_samples_preset, num_threads);
 
 #pragma omp parallel sections
     {
 #pragma omp section
-      parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
+      parallel_sort_qs_conquer(begin, begin + split,
+                               comp, num_threads_left);
 #pragma omp section
-      parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
+      parallel_sort_qs_conquer(begin + split, end,
+                               comp, num_threads - num_threads_left);
     }
   }
 
@@ -143,9 +159,12 @@
    */
   template<typename RandomAccessIterator, typename Comparator>
   inline void
-  parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
+  parallel_sort_qs(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
 		   Comparator comp,
-		   typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
+      typename std::iterator_traits<RandomAccessIterator>::difference_type n,
+      int num_threads)
   {
     _GLIBCXX_CALL(n)
 
@@ -165,10 +184,7 @@
     // Hard to avoid.
     omp_set_num_threads(num_threads);
 
-    bool old_nested = (omp_get_nested() != 0);
-    omp_set_nested(true);
     parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
-    omp_set_nested(old_nested);
   }
 
 }	//namespace __gnu_parallel
Index: include/parallel/compiletime_settings.h
===================================================================
--- include/parallel/compiletime_settings.h	(revision 130225)
+++ include/parallel/compiletime_settings.h	(working copy)
@@ -39,7 +39,7 @@
 #include <cstdio>
 
 /** @brief Determine verbosity level of the parallel mode.
- *  Level 1 prints a message each time when entering a parallel-mode function. */
+ *  Level 1 prints a message each time a parallel-mode function is entered. */
 #define _GLIBCXX_VERBOSE_LEVEL 0
 
 /** @def _GLIBCXX_CALL
@@ -50,27 +50,40 @@
 #define _GLIBCXX_CALL(n)
 #endif
 #if (_GLIBCXX_VERBOSE_LEVEL == 1)
-#define _GLIBCXX_CALL(n) printf("   %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
+#define _GLIBCXX_CALL(n) \
+  printf("   %s:\niam = %d, n = %ld, num_threads = %d\n", \
+  __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
 #endif
 
+#ifndef _GLIBCXX_SCALE_DOWN_FPU
 /** @brief Use floating-point scaling instead of modulo for mapping
  *  random numbers to a range.  This can be faster on certain CPUs. */
 #define _GLIBCXX_SCALE_DOWN_FPU 0
+#endif
 
+#ifndef _GLIBCXX_ASSERTIONS
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
  *  Should be switched on only locally. */
 #define _GLIBCXX_ASSERTIONS 0
+#endif
 
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the L1 cache for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
+#endif
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 
 /** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- *  Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
+ *  Consider the size of the TLB for
+ *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
+#endif
 
+#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
 /** @brief First copy the data, sort it locally, and merge it back
  * (0); or copy it back after everything is done (1).
  *
  *  Recommendation: 0 */
 #define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
-
+#endif
Index: include/parallel/equally_split.h
===================================================================
--- include/parallel/equally_split.h	(revision 130225)
+++ include/parallel/equally_split.h	(working copy)
@@ -39,30 +39,58 @@
 
 namespace __gnu_parallel
 {
-  /** @brief Function to split a sequence into parts of almost equal size.
+/** @brief Function to split a sequence into parts of almost equal size.
    *
-   *  The resulting sequence s of length p+1 contains the splitting
+ *  The resulting sequence s of length num_threads+1 contains the splitting
    *  positions when splitting the range [0,n) into parts of almost
    *  equal size (plus minus 1).  The first entry is 0, the last one
    *  n. There may result empty parts.
    *  @param n Number of elements
-   *  @param p Number of parts
+ *  @param num_threads Number of parts
    *  @param s Splitters
-   *  @returns End of splitter sequence, i. e. @c s+p+1 */
-  template<typename _DifferenceTp, typename OutputIterator>
+ *  @returns End of splitter sequence, i. e. @c s+num_threads+1 */
+template<typename difference_type, typename OutputIterator>
   OutputIterator
-  equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
+  equally_split(difference_type n,
+                thread_index_t num_threads,
+                OutputIterator s)
   {
-    typedef _DifferenceTp difference_type;
-    difference_type chunk_length = n / p, split = n % p, start = 0;
-    for (int i = 0; i < p; i++)
+    difference_type chunk_length = n / num_threads,
+                    num_longer_chunks = n % num_threads,
+                    pos = 0;
+    for (thread_index_t i = 0; i < num_threads; ++i)
       {
-	*s++ = start;
-	start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
+        *s++ = pos;
+        pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
       }
     *s++ = n;
     return s;
   }
+
+
+/** @brief Function to split a sequence into parts of almost equal size.
+ *
+ *  Returns the position of the splitting point between
+ *  thread number thread_no (included) and
+ *  thread number thread_no+1 (excluded).
+ *  @param n Number of elements
+ *  @param num_threads Number of parts
+ *  @returns Splitting point */
+template<typename difference_type>
+  difference_type
+  equally_split_point(difference_type n,
+                      thread_index_t num_threads,
+                      thread_index_t thread_no)
+  {
+    difference_type chunk_length = n / num_threads,
+                    num_longer_chunks = n % num_threads;
+
+    if(thread_no < num_longer_chunks)
+      return thread_no * (chunk_length + 1);
+    else
+      return num_longer_chunks * (chunk_length + 1)
+          + (thread_no - num_longer_chunks) * chunk_length;
+  }
 }
 
 #endif
Index: include/parallel/omp_loop_static.h
===================================================================
--- include/parallel/omp_loop_static.h	(revision 130225)
+++ include/parallel/omp_loop_static.h	(working copy)
@@ -64,39 +64,50 @@
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
    */
-  template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+template<typename RandomAccessIterator,
+          typename Op,
+          typename Fu,
+          typename Red,
+          typename Result>
   Op
-  for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
+  for_each_template_random_access_omp_loop_static(
+              RandomAccessIterator begin,
 						  RandomAccessIterator end,
-						  Op o, Fu& f, Red r,
-						  Result base, Result& output,
-						  typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+              Op o, Fu& f, Red r, Result base, Result& output,
+              typename std::iterator_traits<RandomAccessIterator>::
+                  difference_type bound)
   {
-    typedef std::iterator_traits<RandomAccessIterator> traits_type;
-    typedef typename traits_type::difference_type difference_type;
+    typedef typename
+        std::iterator_traits<RandomAccessIterator>::difference_type
+        difference_type;
 
-    thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
-    Result *thread_results = new Result[num_threads];
     difference_type length = end - begin;
+    thread_index_t num_threads =
+        std::min<difference_type>(get_max_threads(), length);
 
-    for (thread_index_t i = 0; i < num_threads; i++)
-      {
-	thread_results[i] = r(thread_results[i], f(o, begin+i));
-      }
+    Result *thread_results;
 
-#pragma omp parallel num_threads(num_threads)
+#   pragma omp parallel num_threads(num_threads)
     {
-#pragma omp for schedule(static, Settings::workstealing_chunk_size)
-      for (difference_type pos = 0; pos < length; pos++)
+#       pragma omp single
 	{
-	  thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
+            num_threads = omp_get_num_threads();
+            thread_results = new Result[num_threads];
+
+            for (thread_index_t i = 0; i < num_threads; i++)
+              thread_results[i] = Result();
 	}
-    }
 
+        thread_index_t iam = omp_get_thread_num();
+
+#       pragma omp for schedule(static, Settings::workstealing_chunk_size)
+        for (difference_type pos = 0; pos < length; pos++)
+          thread_results[iam] =
+              r(thread_results[iam], f(o, begin+pos));
+      } //parallel
+
     for (thread_index_t i = 0; i < num_threads; i++)
-      {
 	output = r(output, thread_results[i]);
-      }
 
     delete [] thread_results;
 
@@ -106,6 +117,7 @@
 
     return o;
   }
+
 } // end namespace
 
 #endif
Index: include/parallel/random_shuffle.h
===================================================================
--- include/parallel/random_shuffle.h	(revision 130225)
+++ include/parallel/random_shuffle.h	(working copy)
@@ -45,16 +45,16 @@
 
 namespace __gnu_parallel
 {
-  /** @brief Type to hold the index of a bin.
+/** @brief Type to hold the index of a bin.
    *
    *  Since many variables of this type are allocated, it should be
    *  chosen as small as possible.
    */
-  typedef unsigned short bin_index;
+typedef unsigned short bin_index;
 
-  /** @brief Data known to every thread participating in
+/** @brief Data known to every thread participating in
       __gnu_parallel::parallel_random_shuffle(). */
-  template<typename RandomAccessIterator>
+template<typename RandomAccessIterator>
   struct DRandomShufflingGlobalData
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
@@ -90,18 +90,15 @@
     : source(_source) { }
   };
 
-  /** @brief Local data for a thread participating in
+/** @brief Local data for a thread participating in
       __gnu_parallel::parallel_random_shuffle().
    */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   struct DRSSorterPU
   {
     /** @brief Number of threads participating in total. */
     int num_threads;
 
-    /** @brief Number of owning thread. */
-    int iam;
-
     /** @brief Begin index for bins taken care of by this thread. */
     bin_index bins_begin;
 
@@ -115,29 +112,29 @@
     DRandomShufflingGlobalData<RandomAccessIterator>* sd;
   };
 
-  /** @brief Generate a random number in @c [0,2^logp).
+/** @brief Generate a random number in @c [0,2^logp).
    *  @param logp Logarithm (basis 2) of the upper range bound.
    *  @param rng Random number generator to use.
    */
-  template<typename RandomNumberGenerator>
+template<typename RandomNumberGenerator>
   inline int
   random_number_pow2(int logp, RandomNumberGenerator& rng)
   { return rng.genrand_bits(logp); }
 
-  /** @brief Random shuffle code executed by each thread.
+/** @brief Random shuffle code executed by each thread.
    *  @param pus Array of thread-local data records. */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   inline void 
-  parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, 
+  parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
 				 RandomNumberGenerator>* pus)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
-    DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
+    thread_index_t iam = omp_get_thread_num();
+    DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
     DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
-    thread_index_t iam = d->iam;
 
     // Indexing: dist[bin][processor]
     difference_type length = sd->starts[iam + 1] - sd->starts[iam];
@@ -166,9 +163,9 @@
     for (bin_index b = 0; b < sd->num_bins + 1; b++)
       sd->dist[b][iam + 1] = dist[b];
 
-#pragma omp barrier
+#   pragma omp barrier
 
-#pragma omp single
+#   pragma omp single
     {
       // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
       // total number of items in bin s
@@ -178,13 +175,13 @@
 				      sd->dist[s + 1]);
     }
 
-#pragma omp barrier
+#   pragma omp barrier
 
     sequence_index_t offset = 0, global_offset = 0;
     for (bin_index s = 0; s < d->bins_begin; s++)
       global_offset += sd->dist[s + 1][d->num_threads];
 
-#pragma omp barrier
+#   pragma omp barrier
 
     for (bin_index s = d->bins_begin; s < d->bins_end; s++)
       {
@@ -193,9 +190,10 @@
 	offset = sd->dist[s + 1][d->num_threads];
       }
 
-    sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
+    sd->temporaries[iam] = static_cast<value_type*>(
+      ::operator new(sizeof(value_type) * offset));
 
-#pragma omp barrier
+#   pragma omp barrier
 
     // Draw local copies to avoid false sharing.
     for (bin_index b = 0; b < sd->num_bins + 1; b++)
@@ -223,23 +221,27 @@
     delete[] bin_proc;
     delete[] temporaries;
 
-#pragma omp barrier
+#   pragma omp barrier
 
     // Shuffle bins internally.
     for (bin_index b = d->bins_begin; b < d->bins_end; b++)
       {
-	value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
-	  * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
+        value_type* begin =
+                    sd->temporaries[iam] +
+                    ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
+                  * end =
+                    sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
 	sequential_random_shuffle(begin, end, rng);
-	std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
+        std::copy(begin, end, sd->source + global_offset +
+            ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
       }
 
     delete[] sd->temporaries[iam];
   }
 
-  /** @brief Round up to the next greater power of 2.
+/** @brief Round up to the next greater power of 2.
    *  @param x Integer to round up */
-  template<typename T>
+template<typename T>
   T 
   round_up_to_pow2(T x)
   {
@@ -249,16 +251,21 @@
       return (T)1 << (log2(x - 1) + 1);
   }
 
-  /** @brief Main parallel random shuffle step.
+/** @brief Main parallel random shuffle step.
    *  @param begin Begin iterator of sequence.
    *  @param end End iterator of sequence.
    *  @param n Length of sequence.
    *  @param num_threads Number of threads to use.
    *  @param rng Random number generator to use.
    */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   inline void
-  parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
+  parallel_random_shuffle_drs(
+      RandomAccessIterator begin,
+      RandomAccessIterator end,
+      typename std::iterator_traits<RandomAccessIterator>::difference_type n,
+      thread_index_t num_threads,
+      RandomNumberGenerator& rng)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -275,16 +282,17 @@
     // Try the L1 cache first.
 
     // Must fit into L1.
-    num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
+    num_bins_cache = std::max<difference_type>(
+        1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
     num_bins_cache = round_up_to_pow2(num_bins_cache);
 
     // No more buckets than TLB entries, power of 2
     // Power of 2 and at least one element per bin, at most the TLB size.
-    num_bins = std::min(n, (difference_type)num_bins_cache);
+    num_bins = std::min<difference_type>(n, num_bins_cache);
 
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
     // 2 TLB entries needed per bin.
-    num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+    num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
 #endif
     num_bins = round_up_to_pow2(num_bins);
 
@@ -293,32 +301,41 @@
 #endif
 	// Now try the L2 cache
 	// Must fit into L2
-	num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
+        num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
+            1, n / (Settings::L2_cache_size / sizeof(value_type))));
 	num_bins_cache = round_up_to_pow2(num_bins_cache);
 
 	// No more buckets than TLB entries, power of 2.
-	num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
+        num_bins = static_cast<bin_index>(
+            std::min(n, static_cast<difference_type>(num_bins_cache)));
 	// Power of 2 and at least one element per bin, at most the TLB size.
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
 	// 2 TLB entries needed per bin.
-	num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+        num_bins = std::min(
+            static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
 #endif
 	num_bins = round_up_to_pow2(num_bins);
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
       }
 #endif
 
-    num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
+    num_threads = std::min<bin_index>(num_threads, num_bins);
 
     if (num_threads <= 1)
       return sequential_random_shuffle(begin, end, rng);
 
     DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
+    DRSSorterPU<RandomAccessIterator, random_number >* pus;
+    difference_type* starts;
 
-    DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
+#   pragma omp parallel num_threads(num_threads)
+      {
+#       pragma omp single
+          {
+            pus = new DRSSorterPU<RandomAccessIterator, random_number>
+                [num_threads];
 
     sd.temporaries = new value_type*[num_threads];
-    //sd.oracles = new bin_index[n];
     sd.dist = new difference_type*[num_bins + 1];
     sd.bin_proc = new thread_index_t[num_bins];
     for (bin_index b = 0; b < num_bins + 1; b++)
@@ -328,34 +345,36 @@
 	sd.dist[0][0] = 0;
 	sd.dist[b][0] = 0;
       }
-    difference_type* starts = sd.starts = new difference_type[num_threads + 1];
+            starts = sd.starts = new difference_type[num_threads + 1];
     int bin_cursor = 0;
     sd.num_bins = num_bins;
     sd.num_bits = log2(num_bins);
 
-    difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
-    int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
-    for (int i = 0; i < num_threads; i++)
+            difference_type chunk_length = n / num_threads,
+                            split = n % num_threads, start = 0;
+            difference_type bin_chunk_length = num_bins / num_threads,
+                            bin_split = num_bins % num_threads;
+            for (thread_index_t i = 0; i < num_threads; i++)
       {
 	starts[i] = start;
 	start += (i < split) ? (chunk_length + 1) : chunk_length;
 	int j = pus[i].bins_begin = bin_cursor;
 
 	// Range of bins for this processor.
-	bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
+                bin_cursor += (i < bin_split) ?
+                    (bin_chunk_length + 1) : bin_chunk_length;
 	pus[i].bins_end = bin_cursor;
 	for (; j < bin_cursor; j++)
 	  sd.bin_proc[j] = i;
 	pus[i].num_threads = num_threads;
-	pus[i].iam = i;
 	pus[i].seed = rng(std::numeric_limits<uint32>::max());
 	pus[i].sd = &sd;
       }
     starts[num_threads] = start;
-
+          } //single
     // Now shuffle in parallel.
-#pragma omp parallel num_threads(num_threads)
     parallel_random_shuffle_drs_pu(pus);
+    }
 
     delete[] starts;
     delete[] sd.bin_proc;
@@ -367,15 +386,15 @@
     delete[] pus;
   }
 
-  /** @brief Sequential cache-efficient random shuffle.
+/** @brief Sequential cache-efficient random shuffle.
    *  @param begin Begin iterator of sequence.
    *  @param end End iterator of sequence.
    *  @param rng Random number generator to use.
    */
-  template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
   inline void
   sequential_random_shuffle(RandomAccessIterator begin, 
-			    RandomAccessIterator end, 
+                            RandomAccessIterator end,
 			    RandomNumberGenerator& rng)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
@@ -388,7 +407,9 @@
 
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
     // Try the L1 cache first, must fit into L1.
-    num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
+    num_bins_cache =
+        std::max<difference_type>
+            (1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
     num_bins_cache = round_up_to_pow2(num_bins_cache);
 
     // No more buckets than TLB entries, power of 2
@@ -404,16 +425,20 @@
       {
 #endif
 	// Now try the L2 cache, must fit into L2.
-	num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
+        num_bins_cache =
+            static_cast<bin_index>(std::max<difference_type>(
+                1, n / (Settings::L2_cache_size / sizeof(value_type))));
 	num_bins_cache = round_up_to_pow2(num_bins_cache);
 
 	// No more buckets than TLB entries, power of 2
 	// Power of 2 and at least one element per bin, at most the TLB size.
-	num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
+        num_bins = static_cast<bin_index>
+            (std::min(n, static_cast<difference_type>(num_bins_cache)));
 
 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
 	// 2 TLB entries needed per bin
-	num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+        num_bins =
+            std::min<difference_type>(Settings::TLB_size / 2, num_bins);
 #e