This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.
| Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
|---|---|---|
| Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
| Other format: | [Raw text] | |
> Johannes Singler wrote:
>> I'm aware of that, and got a pointer to a detailed instruction on how to
>> do that by Benjamin Kosnik only recently. I already tried to format the
>> changed parts according to the guidelines.
> Sorry, frankly this is not true: there are obviously many lines wider
> than 80 columns (remember, by the way, that it means 80 overall, CR
> included, therefore from 0 to 78 of actual code)
Well, I meant the code parts that were actually changed. Anyway, now I
reformatted all concerned files. I will process the other files later.
New patch attached.
Major revision of the parallel mode algorithms, to work correctly even
if omp_get_dynamic(), PR 33893.
Also, the quicksort variants work correctly if !omp_get_nested().
However, the speedup is limited to 2 in this case. Getting rid of this
limitation is very elaborate since OpenMP 2.5 does not support barriers
among a subset of a team. Maybe things get better with OpenMP 3.0, I
still have to look into this.
Because the process included a lot of formatting cleanup, I'm also
attaching a version of the patch ignoring white space, for better
readability.
Please approve.
tested x86_64-unknown-linux-gnu: no regressions
2007-11-21 Johannes Singler <singler@ira.uka.de>
*include/parallel/multiway_merge.h: made omp_dynamic-safe
*include/parallel/workstealing.h: made omp_dynamic-safe
*include/parallel/base.h: infrastructure, cleanup
*include/parallel/par_loop.h: made omp_dynamic-safe
*include/parallel/features.h: activate loser tree variant
*include/parallel/quicksort.h: made omp_dynamic-safe
*include/parallel/compiletime_settings.h: settings overridable
*include/parallel/equally_split.h: made omp_dynamic-safe
*include/parallel/omp_loop_static.h: made omp_dynamic-safe
*include/parallel/random_shuffle.h: made omp_dynamic-safe
*include/parallel/balanced_quicksort.h: made omp_dynamic-safe
*include/parallel/set_operations.h: made omp_dynamic-safe
*include/parallel/unique_copy.h: made omp_dynamic-safe
*include/parallel/multiway_mergesort.h: made omp_dynamic-safe
*include/parallel/search.h: made omp_dynamic-safe
*include/parallel/partition.h: made omp_dynamic-safe
*include/parallel/partial_sum.h: made omp_dynamic-safe
*include/parallel/find.h: made omp_dynamic-safe
*include/parallel/omp_loop.h: made omp_dynamic-safe
*include/parallel/losertree.h: avoid default constructor
Johannes
Index: include/parallel/multiway_merge.h
===================================================================
--- include/parallel/multiway_merge.h (revision 130225)
+++ include/parallel/multiway_merge.h (working copy)
@@ -29,16 +29,16 @@
// Public License.
/** @file parallel/multiway_merge.h
- * @brief Implementation of sequential and parallel multiway merge.
- *
- * Explanations on the high-speed merging routines in the appendix of
- *
- * P. Sanders.
- * Fast priority queues for cached memory.
- * ACM Journal of Experimental Algorithmics, 5, 2000.
- *
- * This file is a GNU parallel extension to the Standard C++ Library.
- */
+* @brief Implementation of sequential and parallel multiway merge.
+*
+* Explanations on the high-speed merging routines in the appendix of
+*
+* P. Sanders.
+* Fast priority queues for cached memory.
+* ACM Journal of Experimental Algorithmics, 5, 2000.
+*
+* This file is a GNU parallel extension to the Standard C++ Library.
+*/
// Written by Johannes Singler.
@@ -62,15 +62,15 @@
// XXX need iterator typedefs
namespace __gnu_parallel
{
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator;
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2);
@@ -80,7 +80,7 @@
* Deriving from RandomAccessIterator is not possible since
* RandomAccessIterator need not be a class.
*/
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
class guarded_iterator
{
private:
@@ -99,8 +99,8 @@
* @param end End iterator of sequence.
* @param comp Comparator provided for associated overloaded
* compare operators. */
- inline guarded_iterator(RandomAccessIterator begin,
- RandomAccessIterator end, Comparator& comp)
+ inline guarded_iterator(RandomAccessIterator begin,
+ RandomAccessIterator end, Comparator& comp)
: current(begin), end(end), comp(comp)
{ }
@@ -125,17 +125,21 @@
{ return current; }
friend bool
- operator< <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+ operator< <RandomAccessIterator, Comparator>(
+ guarded_iterator<RandomAccessIterator, Comparator>& bi1,
+ guarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool
- operator<= <RandomAccessIterator, Comparator>(guarded_iterator<RandomAccessIterator, Comparator>& bi1, guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+ operator<= <RandomAccessIterator, Comparator>(
+ guarded_iterator<RandomAccessIterator, Comparator>& bi1,
+ guarded_iterator<RandomAccessIterator, Comparator>& bi2);
};
- /** @brief Compare two elements referenced by guarded iterators.
+/** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less. */
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -147,11 +151,11 @@
return (bi1.comp)(*bi1, *bi2); //normal compare
}
- /** @brief Compare two elements referenced by guarded iterators.
+/** @brief Compare two elements referenced by guarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less equal. */
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
guarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -163,20 +167,20 @@
return !(bi1.comp)(*bi2, *bi1); //normal compare
}
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator;
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
class unguarded_iterator
{
private:
@@ -190,8 +194,8 @@
* @param begin Begin iterator of sequence.
* @param end Unused, only for compatibility.
* @param comp Unused, only for compatibility. */
- inline unguarded_iterator(RandomAccessIterator begin,
- RandomAccessIterator end, Comparator& comp)
+ inline unguarded_iterator(RandomAccessIterator begin,
+ RandomAccessIterator end, Comparator& comp)
: current(begin), comp(comp)
{ }
@@ -206,7 +210,7 @@
/** @brief Dereference operator.
* @return Referenced element. */
- inline typename std::iterator_traits<RandomAccessIterator>::value_type
+ inline typename std::iterator_traits<RandomAccessIterator>::value_type
operator*()
{ return *current; }
@@ -217,17 +221,21 @@
{ return current; }
friend bool
- operator< <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+ operator< <RandomAccessIterator, Comparator>(
+ unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
+ unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
friend bool
- operator<= <RandomAccessIterator, Comparator>(unguarded_iterator<RandomAccessIterator, Comparator>& bi1, unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
+ operator<= <RandomAccessIterator, Comparator>(
+ unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
+ unguarded_iterator<RandomAccessIterator, Comparator>& bi2);
};
- /** @brief Compare two elements referenced by unguarded iterators.
+/** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less. */
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -236,11 +244,11 @@
return (bi1.comp)(*bi1, *bi2);
}
- /** @brief Compare two elements referenced by unguarded iterators.
+/** @brief Compare two elements referenced by unguarded iterators.
* @param bi1 First iterator.
* @param bi2 Second iterator.
* @return @c True if less equal. */
- template<typename RandomAccessIterator, typename Comparator>
+template<typename RandomAccessIterator, typename Comparator>
inline bool
operator<=(unguarded_iterator<RandomAccessIterator, Comparator>& bi1,
unguarded_iterator<RandomAccessIterator, Comparator>& bi2)
@@ -249,26 +257,30 @@
return !(bi1.comp)(*bi2, *bi1);
}
- /** Prepare a set of sequences to be merged without a (end) guard
+/** Prepare a set of sequences to be merged without a (end) guard
* @param seqs_begin
* @param seqs_end
* @param comp
* @param min_sequence
* @param stable
* @pre (seqs_end - seqs_begin > 0) */
- template<typename RandomAccessIteratorIterator, typename Comparator>
- typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
+template<typename RandomAccessIteratorIterator, typename Comparator>
+ typename std::iterator_traits<
+ typename std::iterator_traits<RandomAccessIteratorIterator>::value_type
+ ::first_type>::difference_type
prepare_unguarded(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end, Comparator comp,
int& min_sequence, bool stable)
{
_GLIBCXX_CALL(seqs_end - seqs_begin)
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
- typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
+ typedef typename std::iterator_traits<RandomAccessIterator1>
+ ::difference_type
difference_type;
if ((*seqs_begin).first == (*seqs_begin).second)
@@ -317,7 +329,8 @@
for (; s < (seqs_end - seqs_begin); s++)
{
- RandomAccessIterator1 split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second, min, comp);
+ RandomAccessIterator1 split = std::lower_bound(
+ seqs_begin[s].first, seqs_begin[s].second, min, comp);
overhang_size += seqs_begin[s].second - split;
}
@@ -325,23 +338,27 @@
return overhang_size;
}
- /** Prepare a set of sequences to be merged with a (end) guard (sentinel)
+/** Prepare a set of sequences to be merged with a (end) guard (sentinel)
* @param seqs_begin
* @param seqs_end
* @param comp */
- template<typename RandomAccessIteratorIterator, typename Comparator>
- typename std::iterator_traits<typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type>::difference_type
+template<typename RandomAccessIteratorIterator, typename Comparator>
+ typename std::iterator_traits<typename std::iterator_traits<
+ RandomAccessIteratorIterator>::value_type::first_type>::difference_type
prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin,
RandomAccessIteratorIterator seqs_end,
Comparator comp)
{
_GLIBCXX_CALL(seqs_end - seqs_begin)
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
- typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
+ typedef typename std::iterator_traits<RandomAccessIterator1>
+ ::value_type
value_type;
- typedef typename std::iterator_traits<RandomAccessIterator1>::difference_type
+ typedef typename std::iterator_traits<RandomAccessIterator1>
+ ::difference_type
difference_type;
// Last element in sequence.
@@ -362,8 +379,8 @@
difference_type overhang_size = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
{
- RandomAccessIterator1 split = std::lower_bound((*s).first, (*s).second,
- *max, comp);
+ RandomAccessIterator1 split =
+ std::lower_bound((*s).first, (*s).second, *max, comp);
overhang_size += (*s).second - split;
// Set sentinel.
@@ -374,7 +391,7 @@
return overhang_size;
}
- /** @brief Highly efficient 3-way merging procedure.
+/** @brief Highly efficient 3-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator out output sequence.
@@ -382,15 +399,25 @@
* @param length Maximum length to merge.
* @param stable Unused, stable anyway.
* @return End iterator of output sequence. */
- template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ template<typename RAI, typename C> class iterator,
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_3_variant(
+ RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
-
+
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -456,14 +483,23 @@
return target;
}
- template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp,
+ _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
-
+
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -472,7 +508,8 @@
RandomAccessIterator3 target_end;
// Stable anyway.
- difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
+ difference_type overhang =
+ prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
@@ -480,7 +517,8 @@
if (overhang != -1)
{
- difference_type unguarded_length = std::min(length, total_length - overhang);
+ difference_type unguarded_length =
+ std::min(length, total_length - overhang);
target_end = multiway_merge_3_variant<unguarded_iterator>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
@@ -527,7 +565,7 @@
return target_end;
}
- /** @brief Highly efficient 4-way merging procedure.
+/** @brief Highly efficient 4-way merging procedure.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator out output sequence.
@@ -535,14 +573,23 @@
* @param length Maximum length to merge.
* @param stable Unused, stable anyway.
* @return End iterator of output sequence. */
- template<template<typename RAI, typename C> class iterator, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ template<typename RAI, typename C> class iterator,
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -633,14 +680,23 @@
return target;
}
- template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp,
+ _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length);
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -649,7 +705,8 @@
RandomAccessIterator3 target_end;
// Stable anyway.
- difference_type overhang = prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
+ difference_type overhang =
+ prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
@@ -657,7 +714,8 @@
if (overhang != -1)
{
- difference_type unguarded_length = std::min(length, total_length - overhang);
+ difference_type unguarded_length =
+ std::min(length, total_length - overhang);
target_end = multiway_merge_4_variant<unguarded_iterator>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
overhang = length - unguarded_length;
@@ -674,10 +732,13 @@
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
#endif
- std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > one_missing(seqs_begin, seqs_end);
+ std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> >
+ one_missing(seqs_begin, seqs_end);
one_missing.erase(one_missing.begin() + min_seq); //remove
- target_end = multiway_merge_3_variant<guarded_iterator>(one_missing.begin(), one_missing.end(), target_end, comp, overhang, stable);
+ target_end = multiway_merge_3_variant<guarded_iterator>(
+ one_missing.begin(), one_missing.end(),
+ target_end, comp, overhang, stable);
// Insert back again.
one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]);
@@ -692,7 +753,7 @@
return target_end;
}
- /** @brief Basic multi-way merging procedure.
+/** @brief Basic multi-way merging procedure.
*
* The head elements are kept in a sorted array, new heads are
* inserted linearly.
@@ -702,16 +763,24 @@
* @param comp Comparator.
* @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty.
- * @return End iterator of output sequence.
+ * @return End iterator of output sequence.
*/
- template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp, _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -719,7 +788,8 @@
// Num remaining pieces.
int k = static_cast<int>(seqs_end - seqs_begin), nrp;
- value_type* pl = static_cast<value_type*>(::operator new(sizeof(value_type) * k));
+ value_type* pl = static_cast<value_type*>(
+ ::operator new(sizeof(value_type) * k));
int* source = new int[k];
difference_type total_length = 0;
@@ -818,7 +888,8 @@
// Sink down.
j = 1;
while ((j < nrp) && (comp(pl[j], pl[j - 1]) ||
- (!comp(pl[j - 1], pl[j]) && (source[j] < source[j - 1]))))
+ (!comp(pl[j - 1], pl[j])
+ && (source[j] < source[j - 1]))))
{
std::swap(pl[j - 1], pl[j]);
std::swap(source[j - 1], source[j]);
@@ -869,7 +940,7 @@
return target;
}
- /** @brief Multi-way merging procedure for a high branching factor,
+/** @brief Multi-way merging procedure for a high branching factor,
* guarded case.
*
* The head elements are kept in a loser tree.
@@ -879,16 +950,26 @@
* @param comp Comparator.
* @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty.
- * @return End iterator of output sequence.
+ * @return End iterator of output sequence.
*/
- template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename LT,
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp,
+ _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -978,7 +1059,7 @@
return target;
}
- /** @brief Multi-way merging procedure for a high branching factor,
+/** @brief Multi-way merging procedure for a high branching factor,
* unguarded case.
*
* The head elements are kept in a loser tree.
@@ -989,16 +1070,25 @@
* @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty.
* @return End iterator of output sequence.
- * @pre No input will run out of elements during the merge.
+ * @pre No input will run out of elements during the merge.
*/
- template<typename LT, typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename LT,
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp, typename Comparator>
RandomAccessIterator3
- multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp,
+ _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -1045,13 +1135,16 @@
source = lt.get_min_source();
#if _GLIBCXX_ASSERTIONS
- _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
+ _GLIBCXX_PARALLEL_ASSERT(i == 0
+ || !comp(*(seqs_begin[source].first), *(target - 1)));
#endif
*(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS
- _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i == length - 1));
+ _GLIBCXX_PARALLEL_ASSERT(
+ (seqs_begin[source].first != seqs_begin[source].second)
+ || (i == length - 1));
i++;
#endif
// Feed.
@@ -1071,15 +1164,19 @@
#if _GLIBCXX_ASSERTIONS
if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
printf(" %i %i %i\n", length, i, source);
- _GLIBCXX_PARALLEL_ASSERT(i == 0 || !comp(*(seqs_begin[source].first), *(target - 1)));
+ _GLIBCXX_PARALLEL_ASSERT(i == 0
+ || !comp(*(seqs_begin[source].first), *(target - 1)));
#endif
*(target++) = *(seqs_begin[source].first++);
#if _GLIBCXX_ASSERTIONS
- if (!((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1)))
+ if (!((seqs_begin[source].first != seqs_begin[source].second)
+ || (i >= length - 1)))
printf(" %i %i %i\n", length, i, source);
- _GLIBCXX_PARALLEL_ASSERT((seqs_begin[source].first != seqs_begin[source].second) || (i >= length - 1));
+ _GLIBCXX_PARALLEL_ASSERT(
+ (seqs_begin[source].first != seqs_begin[source].second)
+ || (i >= length - 1));
i++;
#endif
// Feed.
@@ -1091,15 +1188,24 @@
return target;
}
- template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp,
+ _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -1115,7 +1221,8 @@
if (overhang != -1)
{
- difference_type unguarded_length = std::min(length, total_length - overhang);
+ difference_type unguarded_length =
+ std::min(length, total_length - overhang);
target_end = multiway_merge_loser_tree_unguarded
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
@@ -1145,23 +1252,31 @@
return target_end;
}
- template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable)
+ multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp,
+ _DifferenceTp length, bool stable)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
- RandomAccessIterator1;
RandomAccessIterator3 target_end;
- difference_type overhang = prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
+ difference_type overhang =
+ prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
difference_type total_length = 0;
for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; s++)
@@ -1172,7 +1287,8 @@
(*s).second++;
}
- difference_type unguarded_length = std::min(length, total_length - overhang);
+ difference_type unguarded_length =
+ std::min(length, total_length - overhang);
target_end = multiway_merge_loser_tree_unguarded
<typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
(seqs_begin, seqs_end, target, comp, unguarded_length, stable);
@@ -1184,12 +1300,15 @@
#endif
// Copy rest stable.
- for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end && overhang > 0; s++)
+ for (RandomAccessIteratorIterator s = seqs_begin;
+ s != seqs_end && overhang > 0; s++)
{
// Restore.
(*s).second--;
- difference_type local_length = std::min((difference_type)overhang, (difference_type)LENGTH(*s));
- target_end = std::copy((*s).first, (*s).first + local_length, target_end);
+ difference_type local_length =
+ std::min<difference_type>(overhang, LENGTH(*s));
+ target_end = std::copy((*s).first, (*s).first + local_length,
+ target_end);
(*s).first += local_length;
overhang -= local_length;
}
@@ -1203,7 +1322,7 @@
return target_end;
}
- /** @brief Sequential multi-way merging switch.
+/** @brief Sequential multi-way merging switch.
*
* The decision if based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence.
@@ -1214,14 +1333,24 @@
* @param stable Stable merging incurs a performance penalty.
* @param sentinel The sequences have a sentinel element.
* @return End iterator of output sequence. */
- template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel, sequential_tag)
+ multiway_merge(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp, _DifferenceTp length,
+ bool stable, bool sentinel,
+ sequential_tag)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
@@ -1234,7 +1363,8 @@
RandomAccessIterator3 return_target = target;
int k = static_cast<int>(seqs_end - seqs_begin);
- Settings::MultiwayMergeAlgorithm mwma = Settings::multiway_merge_algorithm;
+ Settings::MultiwayMergeAlgorithm mwma =
+ Settings::multiway_merge_algorithm;
if (!sentinel && mwma == Settings::LOSER_TREE_SENTINEL)
mwma = Settings::LOSER_TREE_COMBINED;
@@ -1244,23 +1374,40 @@
case 0:
break;
case 1:
- return_target = std::copy(seqs_begin[0].first, seqs_begin[0].first + length, target);
+ return_target = std::copy(seqs_begin[0].first,
+ seqs_begin[0].first + length,
+ target);
seqs_begin[0].first += length;
break;
case 2:
- return_target = merge_advance(seqs_begin[0].first, seqs_begin[0].second, seqs_begin[1].first, seqs_begin[1].second, target, length, comp);
+ return_target = merge_advance(seqs_begin[0].first,
+ seqs_begin[0].second,
+ seqs_begin[1].first,
+ seqs_begin[1].second,
+ target, length, comp);
break;
case 3:
switch (mwma)
{
case Settings::LOSER_TREE_COMBINED:
- return_target = multiway_merge_3_combined(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_3_combined(seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
case Settings::LOSER_TREE_SENTINEL:
- return_target = multiway_merge_3_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_3_variant<unguarded_iterator>(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
default:
- return_target = multiway_merge_3_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_3_variant<guarded_iterator>(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
}
break;
@@ -1268,13 +1415,25 @@
switch (mwma)
{
case Settings::LOSER_TREE_COMBINED:
- return_target = multiway_merge_4_combined(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_4_combined(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
case Settings::LOSER_TREE_SENTINEL:
- return_target = multiway_merge_4_variant<unguarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_4_variant<unguarded_iterator>(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
default:
- return_target = multiway_merge_4_variant<guarded_iterator>(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_4_variant<guarded_iterator>(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
}
break;
@@ -1283,26 +1442,48 @@
switch (mwma)
{
case Settings::BUBBLE:
- return_target = multiway_merge_bubble(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_bubble(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
#if _GLIBCXX_LOSER_TREE_EXPLICIT
case Settings::LOSER_TREE_EXPLICIT:
- return_target = multiway_merge_loser_tree<LoserTreeExplicit<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_loser_tree<
+ LoserTreeExplicit<value_type, Comparator> >(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE
case Settings::LOSER_TREE:
- return_target = multiway_merge_loser_tree<LoserTree<value_type, Comparator> >(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_loser_tree<
+ LoserTree<value_type, Comparator> >(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE_COMBINED
case Settings::LOSER_TREE_COMBINED:
- return_target = multiway_merge_loser_tree_combined(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_loser_tree_combined(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
#endif
#if _GLIBCXX_LOSER_TREE_SENTINEL
case Settings::LOSER_TREE_SENTINEL:
- return_target = multiway_merge_loser_tree_sentinel(seqs_begin, seqs_end, target, comp, length, stable);
+ return_target = multiway_merge_loser_tree_sentinel(
+ seqs_begin,
+ seqs_end,
+ target,
+ comp, length, stable);
break;
#endif
default:
@@ -1319,7 +1500,7 @@
return return_target;
}
- /** @brief Parallel multi-way merge routine.
+/** @brief Parallel multi-way merge routine.
*
* The decision if based on the branching factor and runtime settings.
* @param seqs_begin Begin iterator of iterator pair input sequence.
@@ -1329,30 +1510,35 @@
* @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty.
* @param sentinel Ignored.
- * @return End iterator of output sequence.
+ * @return End iterator of output sequence.
*/
- template<typename RandomAccessIteratorIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
- parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin, RandomAccessIteratorIterator seqs_end, RandomAccessIterator3 target, Comparator comp, _DifferenceTp length, bool stable, bool sentinel)
+ parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin,
+ RandomAccessIteratorIterator seqs_end,
+ RandomAccessIterator3 target,
+ Comparator comp,
+ _DifferenceTp length, bool stable, bool sentinel)
{
_GLIBCXX_CALL(length)
typedef _DifferenceTp difference_type;
- typedef typename std::iterator_traits<RandomAccessIteratorIterator>::value_type::first_type
+ typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+ ::value_type::first_type
RandomAccessIterator1;
typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
value_type;
-#if _GLIBCXX_ASSERTIONS
- for (RandomAccessIteratorIterator rii = seqs_begin; rii != seqs_end; rii++)
- _GLIBCXX_PARALLEL_ASSERT(is_sorted((*rii).first, (*rii).second, comp));
-#endif
-
// k sequences.
int k = static_cast<int>(seqs_end - seqs_begin);
difference_type total_length = 0;
- for (RandomAccessIteratorIterator raii = seqs_begin; raii != seqs_end; raii++)
+ for (RandomAccessIteratorIterator raii = seqs_begin;
+ raii != seqs_end; raii++)
total_length += LENGTH(*raii);
_GLIBCXX_CALL(total_length)
@@ -1360,32 +1546,50 @@
if (total_length == 0 || k == 0)
return target;
- thread_index_t num_threads = static_cast<thread_index_t>(std::min(static_cast<difference_type>(get_max_threads()), total_length));
-
bool tight = (total_length == length);
+ std::vector<std::pair<difference_type, difference_type> >* pieces;
+
+ thread_index_t num_threads = static_cast<thread_index_t>(
+ std::min<difference_type>(get_max_threads(), total_length));
+
+# pragma omp parallel num_threads (num_threads)
+ {
+# pragma omp single
+ {
+ num_threads = omp_get_num_threads();
// Thread t will have to merge pieces[iam][0..k - 1]
- std::vector<std::pair<difference_type, difference_type> >* pieces = new std::vector<std::pair<difference_type, difference_type> >[num_threads];
+ pieces = new std::vector<
+ std::pair<difference_type, difference_type> >[num_threads];
for (int s = 0; s < num_threads; s++)
pieces[s].resize(k);
- difference_type num_samples = Settings::merge_oversampling * num_threads;
+ difference_type num_samples =
+ Settings::merge_oversampling * num_threads;
if (Settings::multiway_merge_splitting == Settings::SAMPLING)
{
- value_type* samples = static_cast<value_type*>(::operator new(sizeof(value_type) * k * num_samples));
+ value_type* samples = static_cast<value_type*>(
+ ::operator new(sizeof(value_type) * k * num_samples));
// Sample.
for (int s = 0; s < k; s++)
for (int i = 0; (difference_type)i < num_samples; i++)
{
- difference_type sample_index = static_cast<difference_type>(LENGTH(seqs_begin[s]) * (double(i + 1) / (num_samples + 1)) * (double(length) / total_length));
- samples[s * num_samples + i] = seqs_begin[s].first[sample_index];
+ difference_type sample_index =
+ static_cast<difference_type>(
+ LENGTH(seqs_begin[s]) * (double(i + 1) /
+ (num_samples + 1)) * (double(length)
+ / total_length));
+ samples[s * num_samples + i] =
+ seqs_begin[s].first[sample_index];
}
if (stable)
- __gnu_sequential::stable_sort(samples, samples + (num_samples * k), comp);
+ __gnu_sequential::stable_sort(
+ samples, samples + (num_samples * k), comp);
else
- __gnu_sequential::sort(samples, samples + (num_samples * k), comp);
+ __gnu_sequential::sort(
+ samples, samples + (num_samples * k), comp);
for (int slab = 0; slab < num_threads; slab++)
// For each slab / processor.
@@ -1393,42 +1597,59 @@
{
// For each sequence.
if (slab > 0)
- pieces[slab][seq].first = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * slab / num_threads], comp) - seqs_begin[seq].first;
+ pieces[slab][seq].first =
+ std::upper_bound(
+ seqs_begin[seq].first,
+ seqs_begin[seq].second,
+ samples[num_samples * k * slab / num_threads],
+ comp)
+ - seqs_begin[seq].first;
else
{
// Absolute beginning.
pieces[slab][seq].first = 0;
}
if ((slab + 1) < num_threads)
- pieces[slab][seq].second = std::upper_bound(seqs_begin[seq].first, seqs_begin[seq].second, samples[num_samples * k * (slab + 1) / num_threads], comp) - seqs_begin[seq].first;
+ pieces[slab][seq].second =
+ std::upper_bound(
+ seqs_begin[seq].first,
+ seqs_begin[seq].second,
+ samples[num_samples * k * (slab + 1) /
+ num_threads], comp)
+ - seqs_begin[seq].first;
else
- pieces[slab][seq].second = LENGTH(seqs_begin[seq]); //absolute ending
+ pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
}
delete[] samples;
}
else
{
// (Settings::multiway_merge_splitting == Settings::EXACT).
- std::vector<RandomAccessIterator1>* offsets = new std::vector<RandomAccessIterator1>[num_threads];
- std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> > se(k);
+ std::vector<RandomAccessIterator1>* offsets =
+ new std::vector<RandomAccessIterator1>[num_threads];
+ std::vector<
+ std::pair<RandomAccessIterator1, RandomAccessIterator1>
+ > se(k);
copy(seqs_begin, seqs_end, se.begin());
- difference_type* borders = static_cast<difference_type*>(__builtin_alloca(sizeof(difference_type) * (num_threads + 1)));
+ difference_type* borders =
+ new difference_type[num_threads + 1];
equally_split(length, num_threads, borders);
for (int s = 0; s < (num_threads - 1); s++)
{
offsets[s].resize(k);
- multiseq_partition(se.begin(), se.end(), borders[s + 1],
+ multiseq_partition(
+ se.begin(), se.end(), borders[s + 1],
offsets[s].begin(), comp);
// Last one also needed and available.
if (!tight)
{
offsets[num_threads - 1].resize(k);
- multiseq_partition(se.begin(), se.end(),
- difference_type(length),
+ multiseq_partition(se.begin(), se.end(),
+ difference_type(length),
offsets[num_threads - 1].begin(), comp);
}
}
@@ -1446,21 +1667,23 @@
pieces[slab][seq].first = 0;
}
else
- pieces[slab][seq].first = pieces[slab - 1][seq].second;
+ pieces[slab][seq].first =
+ pieces[slab - 1][seq].second;
if (!tight || slab < (num_threads - 1))
- pieces[slab][seq].second = offsets[slab][seq] - seqs_begin[seq].first;
+ pieces[slab][seq].second =
+ offsets[slab][seq] - seqs_begin[seq].first;
else
{
// slab == num_threads - 1
- pieces[slab][seq].second = LENGTH(seqs_begin[seq]);
+ pieces[slab][seq].second =
+ LENGTH(seqs_begin[seq]);
}
}
}
delete[] offsets;
}
+ } //single
-# pragma omp parallel num_threads(num_threads)
- {
thread_index_t iam = omp_get_thread_num();
difference_type target_position = 0;
@@ -1470,16 +1693,21 @@
if (k > 2)
{
- std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks = new std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
+ std::pair<RandomAccessIterator1, RandomAccessIterator1>* chunks
+ = new
+ std::pair<RandomAccessIterator1, RandomAccessIterator1>[k];
difference_type local_length = 0;
for (int s = 0; s < k; s++)
{
- chunks[s] = std::make_pair(seqs_begin[s].first + pieces[iam][s].first, seqs_begin[s].first + pieces[iam][s].second);
+ chunks[s] = std::make_pair(
+ seqs_begin[s].first + pieces[iam][s].first,
+ seqs_begin[s].first + pieces[iam][s].second);
local_length += LENGTH(chunks[s]);
}
- multiway_merge(chunks, chunks + k, target + target_position, comp,
+ multiway_merge(
+ chunks, chunks + k, target + target_position, comp,
std::min(local_length, length - target_position),
stable, false, sequential_tag());
@@ -1487,16 +1715,19 @@
}
else if (k == 2)
{
- RandomAccessIterator1 begin0 = seqs_begin[0].first + pieces[iam][0].first, begin1 = seqs_begin[1].first + pieces[iam][1].first;
+ RandomAccessIterator1
+ begin0 = seqs_begin[0].first + pieces[iam][0].first,
+ begin1 = seqs_begin[1].first + pieces[iam][1].first;
merge_advance(begin0,
seqs_begin[0].first + pieces[iam][0].second,
begin1,
seqs_begin[1].first + pieces[iam][1].second,
target + target_position,
- (pieces[iam][0].second - pieces[iam][0].first) + (pieces[iam][1].second - pieces[iam][1].first),
+ (pieces[iam][0].second - pieces[iam][0].first) +
+ (pieces[iam][1].second - pieces[iam][1].first),
comp);
}
- }
+ } //parallel
#if _GLIBCXX_ASSERTIONS
_GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
@@ -1511,7 +1742,7 @@
return target + length;
}
- /**
+/**
* @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
@@ -1519,9 +1750,13 @@
* @param comp Comparator.
* @param length Maximum length to merge.
* @param stable Stable merging incurs a performance penalty.
- * @return End iterator of output sequence.
+ * @return End iterator of output sequence.
*/
- template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorPairIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
multiway_merge(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end,
@@ -1535,15 +1770,21 @@
return target;
RandomAccessIterator3 target_end;
- if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
- target_end = parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (difference_type)length, stable, false);
+ if (_GLIBCXX_PARALLEL_CONDITION(
+ ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
+ && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
+ target_end = parallel_multiway_merge(
+ seqs_begin, seqs_end,
+ target, comp, static_cast<difference_type>(length), stable, false);
else
- target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, false, sequential_tag());
+ target_end = multiway_merge(
+ seqs_begin, seqs_end,
+ target, comp, length, stable, false, sequential_tag());
return target_end;
}
- /** @brief Multi-way merging front-end.
+/** @brief Multi-way merging front-end.
* @param seqs_begin Begin iterator of iterator pair input sequence.
* @param seqs_end End iterator of iterator pair input sequence.
* @param target Begin iterator out output sequence.
@@ -1554,7 +1795,11 @@
* @pre For each @c i, @c seqs_begin[i].second must be the end
* marker of the sequence, but also reference the one more sentinel
* element. */
- template<typename RandomAccessIteratorPairIterator, typename RandomAccessIterator3, typename _DifferenceTp, typename Comparator>
+template<
+ typename RandomAccessIteratorPairIterator,
+ typename RandomAccessIterator3,
+ typename _DifferenceTp,
+ typename Comparator>
RandomAccessIterator3
multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin,
RandomAccessIteratorPairIterator seqs_end,
@@ -1570,10 +1815,16 @@
_GLIBCXX_CALL(seqs_end - seqs_begin)
- if (_GLIBCXX_PARALLEL_CONDITION(((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k) && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
- return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, (typename std::iterator_traits<RandomAccessIterator3>::difference_type)length, stable, true);
+ if (_GLIBCXX_PARALLEL_CONDITION(
+ ((seqs_end - seqs_begin) >= Settings::multiway_merge_minimal_k)
+ && ((sequence_index_t)length >= Settings::multiway_merge_minimal_n)))
+ return parallel_multiway_merge(
+ seqs_begin, seqs_end,
+ target, comp, static_cast<difference_type>(length), stable, true);
else
- return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable, true, sequential_tag());
+ return multiway_merge(
+ seqs_begin, seqs_end,
+ target, comp, length, stable, true, sequential_tag());
}
}
Index: include/parallel/losertree.h
===================================================================
--- include/parallel/losertree.h (revision 130225)
+++ include/parallel/losertree.h (working copy)
@@ -29,9 +29,9 @@
// Public License.
/** @file parallel/losertree.h
- * @brief Many generic loser tree variants.
- * This file is a GNU parallel extension to the Standard C++ Library.
- */
+* @brief Many generic loser tree variants.
+* This file is a GNU parallel extension to the Standard C++ Library.
+*/
// Written by Johannes Singler.
@@ -49,13 +49,13 @@
#if _GLIBCXX_LOSER_TREE_EXPLICIT
- /** @brief Guarded loser tree, copying the whole element into the
- * tree structure.
- *
- * Guarding is done explicitly through two flags per element, inf
- * and sup This is a quite slow variant.
- */
- template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, copying the whole element into the
+* tree structure.
+*
+* Guarding is done explicitly through two flags per element, inf
+* and sup This is a quite slow variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
class LoserTreeExplicit
{
private:
@@ -76,7 +76,9 @@
Comparator comp;
public:
- inline LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>()) : comp(_comp)
+ inline
+ LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>())
+ : comp(_comp)
{
size = _size;
offset = size;
@@ -93,9 +95,6 @@
inline ~LoserTreeExplicit()
{ delete[] losers; }
- inline void
- print() { }
-
inline int
get_min_source()
{ return losers[0].source; }
@@ -106,7 +105,8 @@
bool inf = false;
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{
- if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup && comp(losers[pos].key, key)) || losers[pos].inf || sup)
+ if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
+ && comp(losers[pos].key, key)) || losers[pos].inf || sup)
{
// The other one is smaller.
std::swap(losers[pos].key, key);
@@ -133,7 +133,7 @@
for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
{
// The smaller one gets promoted.
- if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
+ if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
&& comp(losers[pos].key, key))
|| losers[pos].inf || sup)
{
@@ -209,14 +209,14 @@
#if _GLIBCXX_LOSER_TREE
- /** @brief Guarded loser tree, either copying the whole element into
- * the tree structure, or looking up the element via the index.
- *
- * Guarding is done explicitly through one flag sup per element,
- * inf is not needed due to a better initialization routine. This
- * is a well-performing variant.
- */
- template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, either copying the whole element into
+* the tree structure, or looking up the element via the index.
+*
+* Guarding is done explicitly through one flag sup per element,
+* inf is not needed due to a better initialization routine. This
+* is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
class LoserTree
{
private:
@@ -240,7 +240,7 @@
// Next greater power of 2.
k = 1 << (log2(ik - 1) + 1);
offset = k;
- losers = new Loser[k * 2];
+ losers = static_cast<Loser*>(::operator new(k * 2 * sizeof(Loser)));
for (unsigned int i = ik - 1; i < k; i++)
losers[i + k].sup = true;
}
@@ -248,14 +248,6 @@
inline ~LoserTree()
{ delete[] losers; }
- void
- print()
- {
- for (unsigned int i = 0; i < (k * 2); i++)
- printf("%d %d from %d, %d\n", i, losers[i].key,
- losers[i].source, losers[i].sup);
- }
-
inline int
get_min_source()
{ return losers[0].source; }
@@ -267,7 +259,7 @@
losers[pos].sup = sup;
losers[pos].source = source;
- losers[pos].key = key;
+ new(&(losers[pos].key)) T(key);
}
unsigned int
@@ -282,7 +274,8 @@
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if (losers[right].sup ||
- (!losers[left].sup && !comp(losers[right].key, losers[left].key)))
+ (!losers[left].sup
+ && !comp(losers[right].key, losers[left].key)))
{
// Left one is less or equal.
losers[root] = losers[right];
@@ -337,8 +330,9 @@
{
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
- if ( losers[right].sup ||
- (!losers[left].sup && !comp(losers[right].key, losers[left].key)))
+ if (losers[right].sup
+ || (!losers[left].sup
+ && !comp(losers[right].key, losers[left].key)))
{
// Left one is less or equal.
losers[root] = losers[right];
@@ -365,10 +359,11 @@
for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
{
// The smaller one gets promoted, ties are broken by source.
- if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
- (!sup && !losers[pos].sup &&
- ((comp(losers[pos].key, key)) ||
- (!comp(key, losers[pos].key) && losers[pos].source < source))))
+ if ( (sup && (!losers[pos].sup || losers[pos].source < source))
+ || (!sup && !losers[pos].sup
+ && ((comp(losers[pos].key, key))
+ || (!comp(key, losers[pos].key)
+ && losers[pos].source < source))))
{
// The other one is smaller.
std::swap(losers[pos].sup, sup);
@@ -387,14 +382,14 @@
#if _GLIBCXX_LOSER_TREE_REFERENCE
- /** @brief Guarded loser tree, either copying the whole element into
- * the tree structure, or looking up the element via the index.
- *
- * Guarding is done explicitly through one flag sup per element,
- * inf is not needed due to a better initialization routine. This
- * is a well-performing variant.
- */
- template<typename T, typename Comparator = std::less<T> >
+/** @brief Guarded loser tree, either copying the whole element into
+* the tree structure, or looking up the element via the index.
+*
+* Guarding is done explicitly through one flag sup per element,
+* inf is not needed due to a better initialization routine. This
+* is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
class LoserTreeReference
{
#undef COPY
@@ -423,7 +418,9 @@
Comparator comp;
public:
- inline LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+ inline
+ LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>())
+ : comp(_comp)
{
ik = _k;
@@ -446,13 +443,6 @@
#endif
}
- void
- print()
- {
- for (unsigned int i = 0; i < (k * 2); i++)
- printf("%d %d from %d, %d\n", i, KEY(i), losers[i].source, losers[i].sup);
- }
-
inline int
get_min_source()
{ return losers[0].source; }
@@ -570,7 +560,8 @@
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup &&
((comp(KEY(pos), KEY_SOURCE(source))) ||
- (!comp(KEY_SOURCE(source), KEY(pos)) && losers[pos].source < source))))
+ (!comp(KEY_SOURCE(source), KEY(pos))
+ && losers[pos].source < source))))
{
// The other one is smaller.
std::swap(losers[pos].sup, sup);
@@ -595,13 +586,13 @@
#if _GLIBCXX_LOSER_TREE_POINTER
- /** @brief Guarded loser tree, either copying the whole element into
+/** @brief Guarded loser tree, either copying the whole element into
the tree structure, or looking up the element via the index.
- * Guarding is done explicitly through one flag sup per element,
- * inf is not needed due to a better initialization routine.
- * This is a well-performing variant.
- */
- template<typename T, typename Comparator = std::less<T> >
+* Guarding is done explicitly through one flag sup per element,
+* inf is not needed due to a better initialization routine.
+* This is a well-performing variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
class LoserTreePointer
{
private:
@@ -617,7 +608,9 @@
Comparator comp;
public:
- inline LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+ inline
+ LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
+ : comp(_comp)
{
ik = _k;
@@ -632,13 +625,6 @@
inline ~LoserTreePointer()
{ delete[] losers; }
- void
- print()
- {
- for (unsigned int i = 0; i < (k * 2); i++)
- printf("%d %d from %d, %d\n", i, losers[i].keyp, losers[i].source, losers[i].sup);
- }
-
inline int
get_min_source()
{ return losers[0].source; }
@@ -664,8 +650,9 @@
{
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
- if ( losers[right].sup ||
- (!losers[left].sup && !comp(*losers[right].keyp, *losers[left].keyp)))
+ if (losers[right].sup
+ || (!losers[left].sup
+ && !comp(*losers[right].keyp, *losers[left].keyp)))
{
// Left one is less or equal.
losers[root] = losers[right];
@@ -684,7 +671,7 @@
init()
{ losers[0] = losers[init_winner(1)]; }
- inline void
+ inline void
delete_min_insert(const T& key, bool sup)
{
const T* keyp = &key;
@@ -722,7 +709,7 @@
unsigned int left = init_winner (2 * root);
unsigned int right = init_winner (2 * root + 1);
if (losers[right].sup
- || (!losers[left].sup && !comp(*losers[right].keyp,
+ || (!losers[left].sup && !comp(*losers[right].keyp,
*losers[left].keyp)))
{
// Left one is less or equal.
@@ -753,7 +740,7 @@
if ( (sup && (!losers[pos].sup || losers[pos].source < source)) ||
(!sup && !losers[pos].sup &&
((comp(*losers[pos].keyp, *keyp)) ||
- (!comp(*keyp, *losers[pos].keyp)
+ (!comp(*keyp, *losers[pos].keyp)
&& losers[pos].source < source))))
{
// The other one is smaller.
@@ -773,13 +760,13 @@
#if _GLIBCXX_LOSER_TREE_UNGUARDED
- /** @brief Unguarded loser tree, copying the whole element into the
- * tree structure.
- *
- * No guarding is done, therefore not a single input sequence must
- * run empty. This is a very fast variant.
- */
- template<typename T, typename Comparator = std::less<T> >
+/** @brief Unguarded loser tree, copying the whole element into the
+* tree structure.
+*
+* No guarding is done, therefore not a single input sequence must
+* run empty. This is a very fast variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
class LoserTreeUnguarded
{
private:
@@ -809,7 +796,9 @@
}
public:
- inline LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+ inline
+ LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>())
+ : comp(_comp)
{
ik = _k;
// Next greater or equal power of 2.
@@ -826,13 +815,6 @@
delete[] mapping;
}
- void
- print()
- {
- for (unsigned int i = 0; i < k + ik; i++)
- printf("%d %d from %d\n", i, losers[i].key, losers[i].source);
- }
-
inline int
get_min_source()
{ return losers[0].source; }
@@ -855,7 +837,8 @@
// Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division);
- unsigned int right = init_winner(2 * root + 1, begin + division, end);
+ unsigned int right =
+ init_winner(2 * root + 1, begin + division, end);
if (!comp(losers[right].key, losers[left].key))
{
// Left one is less or equal.
@@ -912,7 +895,8 @@
{
// The smaller one gets promoted, ties are broken by source.
if (comp(losers[pos].key, keyr)
- || (!comp(keyr, losers[pos].key) && losers[pos].source < source))
+ || (!comp(keyr, losers[pos].key)
+ && losers[pos].source < source))
{
// The other one is smaller.
std::swap(losers[pos].source, source);
@@ -926,13 +910,13 @@
#if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
- /** @brief Unguarded loser tree, keeping only pointers to the
- * elements in the tree structure.
- *
- * No guarding is done, therefore not a single input sequence must
- * run empty. This is a very fast variant.
- */
- template<typename T, typename Comparator = std::less<T> >
+/** @brief Unguarded loser tree, keeping only pointers to the
+* elements in the tree structure.
+*
+* No guarding is done, therefore not a single input sequence must
+* run empty. This is a very fast variant.
+*/
+template<typename T, typename Comparator = std::less<T> >
class LoserTreePointerUnguarded
{
private:
@@ -961,7 +945,10 @@
}
public:
- inline LoserTreePointerUnguarded(unsigned int _k, Comparator _comp = std::less<T>()) : comp(_comp)
+ inline
+ LoserTreePointerUnguarded(unsigned int _k,
+ Comparator _comp = std::less<T>())
+ : comp(_comp)
{
ik = _k;
@@ -979,13 +966,6 @@
delete[] mapping;
}
- void
- print()
- {
- for (unsigned int i = 0; i < k + ik; i++)
- printf("%d %d from %d\n", i, *losers[i].keyp, losers[i].source);
- }
-
inline int
get_min_source()
{ return losers[0].source; }
@@ -1008,7 +988,8 @@
// Next greater or equal power of 2.
unsigned int division = 1 << (log2(end - begin - 1));
unsigned int left = init_winner(2 * root, begin, begin + division);
- unsigned int right = init_winner(2 * root + 1, begin + division, end);
+ unsigned int right
+ = init_winner(2 * root + 1, begin + division, end);
if (!comp(*losers[right].keyp, *losers[left].keyp))
{
// Left one is less or equal.
@@ -1066,7 +1047,7 @@
{
// The smaller one gets promoted, ties are broken by source.
if (comp(*losers[pos].keyp, *keyp)
- || (!comp(*keyp, *losers[pos].keyp)
+ || (!comp(*keyp, *losers[pos].keyp)
&& losers[pos].source < source))
{
// The other one is smaller.
@@ -1079,7 +1060,7 @@
};
#endif
- template<typename _ValueTp, class Comparator>
+template<typename _ValueTp, class Comparator>
struct loser_tree_traits
{
#if _GLIBCXX_LOSER_TREE
@@ -1093,7 +1074,7 @@
#endif
};
- template<typename _ValueTp, class Comparator>
+template<typename _ValueTp, class Comparator>
struct loser_tree_unguarded_traits
{
#if _GLIBCXX_LOSER_TREE_UNGUARDED
Index: include/parallel/workstealing.h
===================================================================
--- include/parallel/workstealing.h (revision 130225)
+++ include/parallel/workstealing.h (working copy)
@@ -55,8 +55,8 @@
#define _GLIBCXX_JOB_VOLATILE volatile
- /** @brief One job for a certain thread. */
- template<typename _DifferenceTp>
+/** @brief One job for a certain thread. */
+template<typename _DifferenceTp>
struct Job
{
typedef _DifferenceTp difference_type;
@@ -78,7 +78,7 @@
_GLIBCXX_JOB_VOLATILE difference_type load;
};
- /** @brief Work stealing algorithm for random access iterators.
+/** @brief Work stealing algorithm for random access iterators.
*
* Uses O(1) additional memory. Synchronization at job lists is
* done with atomic operations.
@@ -96,13 +96,20 @@
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
- template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+template<
+ typename RandomAccessIterator,
+ typename Op,
+ typename Fu,
+ typename Red,
+ typename Result>
Op
- for_each_template_random_access_workstealing(RandomAccessIterator begin,
+ for_each_template_random_access_workstealing(
+ RandomAccessIterator begin,
RandomAccessIterator end,
Op op, Fu& f, Red r,
Result base, Result& output,
- typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+ typename std::iterator_traits<RandomAccessIterator>::difference_type
+ bound)
{
_GLIBCXX_CALL(end - begin)
@@ -110,34 +117,43 @@
typedef typename traits_type::difference_type difference_type;
- difference_type chunk_size = static_cast<difference_type>(Settings::workstealing_chunk_size);
+ difference_type chunk_size =
+ static_cast<difference_type>(Settings::workstealing_chunk_size);
// How many jobs?
difference_type length = (bound < 0) ? (end - begin) : bound;
// To avoid false sharing in a cache line.
- const int stride = Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
+ const int stride =
+ Settings::cache_line_size * 10 / sizeof(Job<difference_type>) + 1;
// Total number of threads currently working.
thread_index_t busy = 0;
- thread_index_t num_threads = get_max_threads();
- difference_type num_threads_min = num_threads < end - begin ? num_threads : end - begin;
+ Job<difference_type> *job;
+
omp_lock_t output_lock;
omp_init_lock(&output_lock);
+ // Write base value to output.
+ output = base;
+
// No more threads than jobs, at least one thread.
- difference_type num_threads_max = num_threads_min > 1 ? num_threads_min : 1;
- num_threads = static_cast<thread_index_t>(num_threads_max);
+ thread_index_t num_threads =
+ __gnu_parallel::max<thread_index_t>(1,
+ __gnu_parallel::min<difference_type>(length, get_max_threads()));
+# pragma omp parallel shared(busy) num_threads(num_threads)
+ {
+
+# pragma omp single
+ {
+ num_threads = omp_get_num_threads();
+
// Create job description array.
- Job<difference_type> *job = new Job<difference_type>[num_threads * stride];
+ job = new Job<difference_type>[num_threads * stride];
+ }
- // Write base value to output.
- output = base;
-
-#pragma omp parallel shared(busy) num_threads(num_threads)
- {
// Initialization phase.
// Flags for every thread if it is doing productive work.
@@ -158,19 +174,22 @@
// Number of elements to steal in one attempt.
difference_type steal;
- // Every thread has its own random number generator (modulo num_threads).
+ // Every thread has its own random number generator
+ // (modulo num_threads).
random_number rand_gen(iam, num_threads);
-#pragma omp atomic
// This thread is currently working.
+# pragma omp atomic
busy++;
iam_working = true;
// How many jobs per thread? last thread gets the rest.
- my_job.first = static_cast<difference_type>(iam * (length / num_threads));
+ my_job.first =
+ static_cast<difference_type>(iam * (length / num_threads));
- my_job.last = (iam == (num_threads - 1)) ? (length - 1) : ((iam + 1) * (length / num_threads) - 1);
+ my_job.last = (iam == (num_threads - 1)) ?
+ (length - 1) : ((iam + 1) * (length / num_threads) - 1);
my_job.load = my_job.last - my_job.first + 1;
// Init result with first value (to have a base value for reduction).
@@ -185,26 +204,29 @@
RandomAccessIterator current;
-#pragma omp barrier
+# pragma omp barrier
// Actual work phase
// Work on own or stolen start
while (busy > 0)
{
// Work until no productive thread left.
-#pragma omp flush(busy)
+# pragma omp flush(busy)
// Thread has own work to do
while (my_job.first <= my_job.last)
{
// fetch-and-add call
// Reserve current job block (size chunk_size) in my queue.
- difference_type current_job = fetch_and_add<difference_type>(&(my_job.first), chunk_size);
+ difference_type current_job =
+ fetch_and_add<difference_type>(&(my_job.first), chunk_size);
// Update load, to make the three values consistent,
// first might have been changed in the meantime
my_job.load = my_job.last - my_job.first + 1;
- for (difference_type job_counter = 0; job_counter < chunk_size && current_job <= my_job.last; job_counter++)
+ for (difference_type job_counter = 0;
+ job_counter < chunk_size && current_job <= my_job.last;
+ job_counter++)
{
// Yes: process it!
current = begin + current_job;
@@ -214,15 +236,14 @@
result = r(result, f(op, current));
}
-#pragma omp flush(busy)
-
+# pragma omp flush(busy)
}
// After reaching this point, a thread's job list is empty.
if (iam_working)
{
-#pragma omp atomic
// This thread no longer has work.
+# pragma omp atomic
busy--;
iam_working = false;
@@ -231,16 +252,17 @@
difference_type supposed_first, supposed_last, supposed_load;
do
{
- // Find random nonempty deque (not own) and do consistency check.
+ // Find random nonempty deque (not own), do consistency check.
yield();
-#pragma omp flush(busy)
+# pragma omp flush(busy)
victim = rand_gen();
supposed_first = job[victim * stride].first;
supposed_last = job[victim * stride].last;
supposed_load = job[victim * stride].load;
}
while (busy > 0
- && ((supposed_load <= 0) || ((supposed_first + supposed_load - 1) != supposed_last)));
+ && ((supposed_load <= 0)
+ || ((supposed_first + supposed_load - 1) != supposed_last)));
if (busy == 0)
break;
@@ -251,40 +273,30 @@
// Number of elements to steal (at least one).
steal = (supposed_load < 2) ? 1 : supposed_load / 2;
- // Protects against stealing threads
- // omp_set_lock(&(job[victim * stride].lock));
-
// Push victim's start forward.
- difference_type stolen_first = fetch_and_add<difference_type>(&(job[victim * stride].first), steal);
- difference_type stolen_try = stolen_first + steal - difference_type(1);
+ difference_type stolen_first =
+ fetch_and_add<difference_type>(
+ &(job[victim * stride].first), steal);
+ difference_type stolen_try =
+ stolen_first + steal - difference_type(1);
- // Protects against working thread
- // omp_unset_lock(&(job[victim * stride].lock));
-
my_job.first = stolen_first;
-
- // Avoid std::min dependencies.
- my_job.last = stolen_try < supposed_last ? stolen_try : supposed_last;
-
+ my_job.last = __gnu_parallel::min(stolen_try, supposed_last);
my_job.load = my_job.last - my_job.first + 1;
- //omp_unset_lock(&(my_job.lock));
-
-#pragma omp atomic
// Has potential work again.
+# pragma omp atomic
busy++;
iam_working = true;
-#pragma omp flush(busy)
+# pragma omp flush(busy)
}
-#pragma omp flush(busy)
+# pragma omp flush(busy)
} // end while busy > 0
// Add accumulated result to output.
omp_set_lock(&output_lock);
output = r(output, result);
omp_unset_lock(&output_lock);
-
- //omp_destroy_lock(&(my_job.lock));
}
delete[] job;
Index: include/parallel/base.h
===================================================================
--- include/parallel/base.h (revision 130225)
+++ include/parallel/base.h (working copy)
@@ -49,12 +49,12 @@
// XXX remove std::duplicates from here if possible,
// XXX but keep minimal dependencies.
- /** @brief Calculates the rounded-down logarithm of @c n for base 2.
+/** @brief Calculates the rounded-down logarithm of @c n for base 2.
* @param n Argument.
* @return Returns 0 for argument 0.
*/
- template<typename Size>
- inline Size
+template<typename Size>
+ inline Size
log2(Size n)
{
Size k;
@@ -63,21 +63,21 @@
return k;
}
- /** @brief Encode two integers into one __gnu_parallel::lcas_t.
+/** @brief Encode two integers into one __gnu_parallel::lcas_t.
* @param a First integer, to be encoded in the most-significant @c
* lcas_t_bits/2 bits.
* @param b Second integer, to be encoded in the least-significant
* @c lcas_t_bits/2 bits.
* @return __gnu_parallel::lcas_t value encoding @c a and @c b.
- * @see decode2
+ * @see decode2
*/
- inline lcas_t
- encode2(int a, int b) //must all be non-negative, actually
- {
+inline lcas_t
+encode2(int a, int b) //must all be non-negative, actually
+{
return (((lcas_t)a) << (lcas_t_bits / 2)) | (((lcas_t)b) << 0);
- }
+}
- /** @brief Decode two integers from one __gnu_parallel::lcas_t.
+/** @brief Decode two integers from one __gnu_parallel::lcas_t.
* @param x __gnu_parallel::lcas_t to decode integers from.
* @param a First integer, to be decoded from the most-significant
* @c lcas_t_bits/2 bits of @c x.
@@ -85,18 +85,34 @@
* @c lcas_t_bits/2 bits of @c x.
* @see encode2
*/
- inline void
- decode2(lcas_t x, int& a, int& b)
- {
+inline void
+decode2(lcas_t x, int& a, int& b)
+{
a = (int)((x >> (lcas_t_bits / 2)) & lcas_t_mask);
b = (int)((x >> 0 ) & lcas_t_mask);
- }
+}
- /** @brief Constructs predicate for equality from strict weak
+/** @brief Equivalent to std::min. */
+template<typename T>
+ const T&
+ min(const T& a, const T& b)
+ {
+ return (a < b) ? a : b;
+ };
+
+/** @brief Equivalent to std::max. */
+template<typename T>
+ const T&
+ max(const T& a, const T& b)
+ {
+ return (a > b) ? a : b;
+ };
+
+/** @brief Constructs predicate for equality from strict weak
* ordering predicate
*/
- // XXX comparator at the end, as per others
- template<typename Comparator, typename T1, typename T2>
+// XXX comparator at the end, as per others
+template<typename Comparator, typename T1, typename T2>
class equal_from_less : public std::binary_function<T1, T2, bool>
{
private:
@@ -112,8 +128,9 @@
};
- /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
- template<typename _Predicate, typename argument_type>
+/** @brief Similar to std::binder1st,
+ * but giving the argument types explicitly. */
+template<typename _Predicate, typename argument_type>
class unary_negate
: public std::unary_function<argument_type, bool>
{
@@ -129,8 +146,13 @@
{ return !_M_pred(__x); }
};
- /** @brief Similar to std::binder1st, but giving the argument types explicitly. */
- template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
+/** @brief Similar to std::binder1st,
+ * but giving the argument types explicitly. */
+template<
+ typename _Operation,
+ typename first_argument_type,
+ typename second_argument_type,
+ typename result_type>
class binder1st
: public std::unary_function<second_argument_type, result_type>
{
@@ -154,11 +176,15 @@
{ return op(value, __x); }
};
- /**
+/**
* @brief Similar to std::binder2nd, but giving the argument types
- * explicitly.
+ * explicitly.
*/
- template<typename _Operation, typename first_argument_type, typename second_argument_type, typename result_type>
+template<
+ typename _Operation,
+ typename first_argument_type,
+ typename second_argument_type,
+ typename result_type>
class binder2nd
: public std::unary_function<first_argument_type, result_type>
{
@@ -182,30 +208,30 @@
{ return op(__x, value); }
};
- /** @brief Similar to std::equal_to, but allows two different types. */
- template<typename T1, typename T2>
+/** @brief Similar to std::equal_to, but allows two different types. */
+template<typename T1, typename T2>
struct equal_to : std::binary_function<T1, T2, bool>
{
bool operator()(const T1& t1, const T2& t2) const
{ return t1 == t2; }
};
- /** @brief Similar to std::less, but allows two different types. */
- template<typename T1, typename T2>
+/** @brief Similar to std::less, but allows two different types. */
+template<typename T1, typename T2>
struct less : std::binary_function<T1, T2, bool>
{
- bool
+ bool
operator()(const T1& t1, const T2& t2) const
{ return t1 < t2; }
- bool
+ bool
operator()(const T2& t2, const T1& t1) const
{ return t2 < t1; }
};
- // Partial specialization for one type. Same as std::less.
- template<typename _Tp>
- struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
+// Partial specialization for one type. Same as std::less.
+template<typename _Tp>
+struct less<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, bool>
{
bool
operator()(const _Tp& __x, const _Tp& __y) const
@@ -214,21 +240,23 @@
/** @brief Similar to std::plus, but allows two different types. */
- template<typename _Tp1, typename _Tp2>
+template<typename _Tp1, typename _Tp2>
struct plus : public std::binary_function<_Tp1, _Tp2, _Tp1>
{
- typedef typeof(*static_cast<_Tp1*>(NULL) + *static_cast<_Tp2*>(NULL)) result;
+ typedef typeof(*static_cast<_Tp1*>(NULL)
+ + *static_cast<_Tp2*>(NULL)) result;
result
operator()(const _Tp1& __x, const _Tp2& __y) const
{ return __x + __y; }
};
- // Partial specialization for one type. Same as std::plus.
- template<typename _Tp>
+// Partial specialization for one type. Same as std::plus.
+template<typename _Tp>
struct plus<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
{
- typedef typeof(*static_cast<_Tp*>(NULL) + *static_cast<_Tp*>(NULL)) result;
+ typedef typeof(*static_cast<_Tp*>(NULL)
+ + *static_cast<_Tp*>(NULL)) result;
result
operator()(const _Tp& __x, const _Tp& __y) const
@@ -236,22 +264,24 @@
};
- /** @brief Similar to std::multiplies, but allows two different types. */
- template<typename _Tp1, typename _Tp2>
+/** @brief Similar to std::multiplies, but allows two different types. */
+template<typename _Tp1, typename _Tp2>
struct multiplies : public std::binary_function<_Tp1, _Tp2, _Tp1>
{
- typedef typeof(*static_cast<_Tp1*>(NULL) * *static_cast<_Tp2*>(NULL)) result;
+ typedef typeof(*static_cast<_Tp1*>(NULL)
+ * *static_cast<_Tp2*>(NULL)) result;
result
operator()(const _Tp1& __x, const _Tp2& __y) const
{ return __x * __y; }
};
- // Partial specialization for one type. Same as std::multiplies.
- template<typename _Tp>
+// Partial specialization for one type. Same as std::multiplies.
+template<typename _Tp>
struct multiplies<_Tp, _Tp> : public std::binary_function<_Tp, _Tp, _Tp>
{
- typedef typeof(*static_cast<_Tp*>(NULL) * *static_cast<_Tp*>(NULL)) result;
+ typedef typeof(*static_cast<_Tp*>(NULL)
+ * *static_cast<_Tp*>(NULL)) result;
result
operator()(const _Tp& __x, const _Tp& __y) const
@@ -259,15 +289,15 @@
};
- template<typename T, typename _DifferenceTp>
+template<typename T, typename _DifferenceTp>
class pseudo_sequence;
- /** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
+/** @brief Iterator associated with __gnu_parallel::pseudo_sequence.
* If features the usual random-access iterator functionality.
* @param T Sequence value type.
- * @param difference_type Sequence difference type.
+ * @param difference_type Sequence difference type.
*/
- template<typename T, typename _DifferenceTp>
+template<typename T, typename _DifferenceTp>
class pseudo_sequence_iterator
{
public:
@@ -296,34 +326,34 @@
operator++(int)
{ return type(pos++); }
- const T&
+ const T&
operator*() const
{ return val; }
- const T&
+ const T&
operator[](difference_type) const
{ return val; }
- bool
+ bool
operator==(const type& i2)
{ return pos == i2.pos; }
- difference_type
+ difference_type
operator!=(const type& i2)
{ return pos != i2.pos; }
- difference_type
+ difference_type
operator-(const type& i2)
{ return pos - i2.pos; }
};
- /** @brief Sequence that conceptually consists of multiple copies of
+/** @brief Sequence that conceptually consists of multiple copies of
the same element.
* The copies are not stored explicitly, of course.
* @param T Sequence value type.
- * @param difference_type Sequence difference type.
+ * @param difference_type Sequence difference type.
*/
- template<typename T, typename _DifferenceTp>
+template<typename T, typename _DifferenceTp>
class pseudo_sequence
{
typedef pseudo_sequence<T, _DifferenceTp> type;
@@ -338,7 +368,7 @@
* @param val Element of the sequence.
* @param count Number of (virtual) copies.
*/
- pseudo_sequence(const T& val, difference_type count)
+ pseudo_sequence(const T& val, difference_type count)
: val(val), count(count) { }
/** @brief Begin iterator. */
@@ -356,24 +386,24 @@
difference_type count;
};
- /** @brief Functor that does nothing */
- template<typename _ValueTp>
+/** @brief Functor that does nothing */
+template<typename _ValueTp>
class void_functor
{
- inline void
+ inline void
operator()(const _ValueTp& v) const { }
};
- /** @brief Compute the median of three referenced elements,
+/** @brief Compute the median of three referenced elements,
according to @c comp.
* @param a First iterator.
* @param b Second iterator.
* @param c Third iterator.
- * @param comp Comparator.
+ * @param comp Comparator.
*/
- template<typename RandomAccessIterator, typename Comparator>
- RandomAccessIterator
- median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
+template<typename RandomAccessIterator, typename Comparator>
+RandomAccessIterator
+ median_of_three_iterators(RandomAccessIterator a, RandomAccessIterator b,
RandomAccessIterator c, Comparator& comp)
{
if (comp(*a, *b))
@@ -397,26 +427,25 @@
}
}
- // Avoid the use of assert, because we're trying to keep the <cassert>
- // include out of the mix. (Same as debug mode).
- inline void
- __replacement_assert(const char* __file, int __line,
+// Avoid the use of assert, because we're trying to keep the <cassert>
+// include out of the mix. (Same as debug mode).
+inline void
+__replacement_assert(const char* __file, int __line,
const char* __function, const char* __condition)
- {
+{
std::printf("%s:%d: %s: Assertion '%s' failed.\n", __file, __line,
__function, __condition);
__builtin_abort();
- }
-
+}
+
#define _GLIBCXX_PARALLEL_ASSERT(_Condition) \
- do \
+do \
{ \
if (!(_Condition)) \
__gnu_parallel::__replacement_assert(__FILE__, __LINE__, \
__PRETTY_FUNCTION__, #_Condition); \
} while (false)
-
+
} //namespace __gnu_parallel
#endif
-
Index: include/parallel/par_loop.h
===================================================================
--- include/parallel/par_loop.h (revision 130225)
+++ include/parallel/par_loop.h (working copy)
@@ -41,11 +41,12 @@
#include <omp.h>
#include <parallel/settings.h>
+#include <parallel/base.h>
namespace __gnu_parallel
{
- /** @brief Embarrassingly parallel algorithm for random access
+/** @brief Embarrassingly parallel algorithm for random access
* iterators, using hand-crafted parallelization by equal splitting
* the work.
*
@@ -63,47 +64,57 @@
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
- template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+template<
+ typename RandomAccessIterator,
+ typename Op,
+ typename Fu,
+ typename Red,
+ typename Result>
Op
- for_each_template_random_access_ed(RandomAccessIterator begin,
- RandomAccessIterator end, Op o, Fu& f,
- Red r, Result base, Result& output,
- typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+ for_each_template_random_access_ed(
+ RandomAccessIterator begin,
+ RandomAccessIterator end,
+ Op o, Fu& f, Red r, Result base, Result& output,
+ typename std::iterator_traits<RandomAccessIterator>::
+ difference_type bound)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::difference_type difference_type;
const difference_type length = end - begin;
- const difference_type settings_threads = static_cast<difference_type>(get_max_threads());
- const difference_type dmin = settings_threads < length ? settings_threads : length;
- const difference_type dmax = dmin > 1 ? dmin : 1;
+ Result *thread_results;
- thread_index_t num_threads = static_cast<thread_index_t>(dmax);
+ thread_index_t num_threads =
+ __gnu_parallel::min<difference_type>(get_max_threads(), length);
+# pragma omp parallel num_threads(num_threads)
+ {
+# pragma omp single
+ {
+ num_threads = omp_get_num_threads();
+ thread_results = new Result[num_threads];
+ }
- Result *thread_results = new Result[num_threads];
+ thread_index_t iam = omp_get_thread_num();
-#pragma omp parallel num_threads(num_threads)
- {
// Neutral element.
Result reduct = Result();
- thread_index_t p = num_threads;
- thread_index_t iam = omp_get_thread_num();
- difference_type start = iam * length / p;
- difference_type limit = (iam == p - 1) ? length : (iam + 1) * length / p;
+ difference_type
+ start = equally_split_point(length, num_threads, iam),
+ stop = equally_split_point(length, num_threads, iam + 1);
- if (start < limit)
+ if (start < stop)
{
reduct = f(o, begin + start);
- start++;
+ ++start;
}
- for (; start < limit; start++)
+ for (; start < stop; ++start)
reduct = r(reduct, f(o, begin + start));
thread_results[iam] = reduct;
- }
+ } //parallel
for (thread_index_t i = 0; i < num_threads; i++)
output = r(output, thread_results[i]);
Index: include/parallel/features.h
===================================================================
--- include/parallel/features.h (revision 130225)
+++ include/parallel/features.h (working copy)
@@ -66,7 +66,7 @@
* @brief Include guarded (sequences may run empty) loser tree,
* moving objects.
* @see __gnu_parallel::Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE 0
+#define _GLIBCXX_LOSER_TREE 1
#endif
#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
Index: include/parallel/quicksort.h
===================================================================
--- include/parallel/quicksort.h (revision 130225)
+++ include/parallel/quicksort.h (working copy)
@@ -53,11 +53,17 @@
* this part.
*/
template<typename RandomAccessIterator, typename Comparator>
- inline typename std::iterator_traits<RandomAccessIterator>::difference_type
- parallel_sort_qs_divide(RandomAccessIterator begin, RandomAccessIterator end,
+ inline
+ typename std::iterator_traits<RandomAccessIterator>::difference_type
+ parallel_sort_qs_divide(
+ RandomAccessIterator begin,
+ RandomAccessIterator end,
Comparator comp,
- typename std::iterator_traits<RandomAccessIterator>::difference_type pivot_rank,
- typename std::iterator_traits<RandomAccessIterator>::difference_type num_samples, thread_index_t num_threads)
+ typename std::iterator_traits<RandomAccessIterator>::difference_type
+ pivot_rank,
+ typename std::iterator_traits<RandomAccessIterator>::difference_type
+ num_samples,
+ thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -65,20 +71,24 @@
difference_type n = end - begin;
num_samples = std::min(num_samples, n);
- value_type* samples = static_cast<value_type*>(__builtin_alloca(sizeof(value_type) * num_samples));
+ // Allocate uninitialized, to avoid default constructor.
+ value_type* samples = static_cast<value_type*>(
+ operator new(num_samples * sizeof(value_type)));
+
for (difference_type s = 0; s < num_samples; s++)
{
- const unsigned long long index = static_cast<unsigned long long>(s)
+ const unsigned long long index = static_cast<unsigned long long>(s)
* n / num_samples;
- samples[s] = begin[index];
+ new(samples + s) value_type(begin[index]);
}
__gnu_sequential::sort(samples, samples + num_samples, comp);
value_type& pivot = samples[pivot_rank * num_samples / n];
- __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> pred(comp, pivot);
+ __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool>
+ pred(comp, pivot);
difference_type split = parallel_partition(begin, end, pred, num_threads);
return split;
@@ -93,7 +103,10 @@
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
- parallel_sort_qs_conquer(RandomAccessIterator begin, RandomAccessIterator end, Comparator comp, int num_threads)
+ parallel_sort_qs_conquer(RandomAccessIterator begin,
+ RandomAccessIterator end,
+ Comparator comp,
+ thread_index_t num_threads)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -110,24 +123,27 @@
if (n <= 1)
return;
- thread_index_t num_processors_left;
+ thread_index_t num_threads_left;
if ((num_threads % 2) == 1)
- num_processors_left = num_threads / 2 + 1;
+ num_threads_left = num_threads / 2 + 1;
else
- num_processors_left = num_threads / 2;
+ num_threads_left = num_threads / 2;
- pivot_rank = n * num_processors_left / num_threads;
+ pivot_rank = n * num_threads_left / num_threads;
- difference_type split = parallel_sort_qs_divide(begin, end, comp, pivot_rank,
-Settings::sort_qs_num_samples_preset, num_threads);
+ difference_type split = parallel_sort_qs_divide(
+ begin, end, comp, pivot_rank,
+ Settings::sort_qs_num_samples_preset, num_threads);
#pragma omp parallel sections
{
#pragma omp section
- parallel_sort_qs_conquer(begin, begin + split, comp, num_processors_left);
+ parallel_sort_qs_conquer(begin, begin + split,
+ comp, num_threads_left);
#pragma omp section
- parallel_sort_qs_conquer(begin + split, end, comp, num_threads - num_processors_left);
+ parallel_sort_qs_conquer(begin + split, end,
+ comp, num_threads - num_threads_left);
}
}
@@ -143,9 +159,12 @@
*/
template<typename RandomAccessIterator, typename Comparator>
inline void
- parallel_sort_qs(RandomAccessIterator begin, RandomAccessIterator end,
+ parallel_sort_qs(
+ RandomAccessIterator begin,
+ RandomAccessIterator end,
Comparator comp,
- typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads)
+ typename std::iterator_traits<RandomAccessIterator>::difference_type n,
+ int num_threads)
{
_GLIBCXX_CALL(n)
@@ -165,10 +184,7 @@
// Hard to avoid.
omp_set_num_threads(num_threads);
- bool old_nested = (omp_get_nested() != 0);
- omp_set_nested(true);
parallel_sort_qs_conquer(begin, begin + n, comp, num_threads);
- omp_set_nested(old_nested);
}
} //namespace __gnu_parallel
Index: include/parallel/compiletime_settings.h
===================================================================
--- include/parallel/compiletime_settings.h (revision 130225)
+++ include/parallel/compiletime_settings.h (working copy)
@@ -39,7 +39,7 @@
#include <cstdio>
/** @brief Determine verbosity level of the parallel mode.
- * Level 1 prints a message each time when entering a parallel-mode function. */
+ * Level 1 prints a message each time a parallel-mode function is entered. */
#define _GLIBCXX_VERBOSE_LEVEL 0
/** @def _GLIBCXX_CALL
@@ -50,27 +50,40 @@
#define _GLIBCXX_CALL(n)
#endif
#if (_GLIBCXX_VERBOSE_LEVEL == 1)
-#define _GLIBCXX_CALL(n) printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
+#define _GLIBCXX_CALL(n) \
+ printf(" %s:\niam = %d, n = %ld, num_threads = %d\n", \
+ __PRETTY_FUNCTION__, omp_get_thread_num(), (n), get_max_threads());
#endif
+#ifndef _GLIBCXX_SCALE_DOWN_FPU
/** @brief Use floating-point scaling instead of modulo for mapping
* random numbers to a range. This can be faster on certain CPUs. */
#define _GLIBCXX_SCALE_DOWN_FPU 0
+#endif
+#ifndef _GLIBCXX_ASSERTIONS
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
* Should be switched on only locally. */
#define _GLIBCXX_ASSERTIONS 0
+#endif
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- * Consider the size of the L1 cache for __gnu_parallel::parallel_random_shuffle(). */
+ * Consider the size of the L1 cache for
+ * __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
+#endif
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
/** @brief Switch on many _GLIBCXX_PARALLEL_ASSERTions in parallel code.
- * Consider the size of the TLB for __gnu_parallel::parallel_random_shuffle(). */
+ * Consider the size of the TLB for
+ * __gnu_parallel::parallel_random_shuffle(). */
#define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
+#endif
+#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
/** @brief First copy the data, sort it locally, and merge it back
* (0); or copy it back after everything is done (1).
*
* Recommendation: 0 */
#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
-
+#endif
Index: include/parallel/equally_split.h
===================================================================
--- include/parallel/equally_split.h (revision 130225)
+++ include/parallel/equally_split.h (working copy)
@@ -39,30 +39,58 @@
namespace __gnu_parallel
{
- /** @brief Function to split a sequence into parts of almost equal size.
+/** @brief Function to split a sequence into parts of almost equal size.
*
- * The resulting sequence s of length p+1 contains the splitting
+ * The resulting sequence s of length num_threads+1 contains the splitting
* positions when splitting the range [0,n) into parts of almost
* equal size (plus minus 1). The first entry is 0, the last one
* n. There may result empty parts.
* @param n Number of elements
- * @param p Number of parts
+ * @param num_threads Number of parts
* @param s Splitters
- * @returns End of splitter sequence, i. e. @c s+p+1 */
- template<typename _DifferenceTp, typename OutputIterator>
+ * @returns End of splitter sequence, i. e. @c s+num_threads+1 */
+template<typename difference_type, typename OutputIterator>
OutputIterator
- equally_split(_DifferenceTp n, thread_index_t p, OutputIterator s)
+ equally_split(difference_type n,
+ thread_index_t num_threads,
+ OutputIterator s)
{
- typedef _DifferenceTp difference_type;
- difference_type chunk_length = n / p, split = n % p, start = 0;
- for (int i = 0; i < p; i++)
+ difference_type chunk_length = n / num_threads,
+ num_longer_chunks = n % num_threads,
+ pos = 0;
+ for (thread_index_t i = 0; i < num_threads; ++i)
{
- *s++ = start;
- start += (difference_type(i) < split) ? (chunk_length + 1) : chunk_length;
+ *s++ = pos;
+ pos += (i < num_longer_chunks) ? (chunk_length + 1) : chunk_length;
}
*s++ = n;
return s;
}
+
+
+/** @brief Function to split a sequence into parts of almost equal size.
+ *
+ * Returns the position of the splitting point between
+ * thread number thread_no (included) and
+ * thread number thread_no+1 (excluded).
+ * @param n Number of elements
+ * @param num_threads Number of parts
+ * @returns Splitting point */
+template<typename difference_type>
+ difference_type
+ equally_split_point(difference_type n,
+ thread_index_t num_threads,
+ thread_index_t thread_no)
+ {
+ difference_type chunk_length = n / num_threads,
+ num_longer_chunks = n % num_threads;
+
+ if(thread_no < num_longer_chunks)
+ return thread_no * (chunk_length + 1);
+ else
+ return num_longer_chunks * (chunk_length + 1)
+ + (thread_no - num_longer_chunks) * chunk_length;
+ }
}
#endif
Index: include/parallel/omp_loop_static.h
===================================================================
--- include/parallel/omp_loop_static.h (revision 130225)
+++ include/parallel/omp_loop_static.h (working copy)
@@ -64,39 +64,50 @@
* std::count_n()).
* @return User-supplied functor (that may contain a part of the result).
*/
- template<typename RandomAccessIterator, typename Op, typename Fu, typename Red, typename Result>
+template<typename RandomAccessIterator,
+ typename Op,
+ typename Fu,
+ typename Red,
+ typename Result>
Op
- for_each_template_random_access_omp_loop_static(RandomAccessIterator begin,
+ for_each_template_random_access_omp_loop_static(
+ RandomAccessIterator begin,
RandomAccessIterator end,
- Op o, Fu& f, Red r,
- Result base, Result& output,
- typename std::iterator_traits<RandomAccessIterator>::difference_type bound)
+ Op o, Fu& f, Red r, Result base, Result& output,
+ typename std::iterator_traits<RandomAccessIterator>::
+ difference_type bound)
{
- typedef std::iterator_traits<RandomAccessIterator> traits_type;
- typedef typename traits_type::difference_type difference_type;
+ typedef typename
+ std::iterator_traits<RandomAccessIterator>::difference_type
+ difference_type;
- thread_index_t num_threads = (get_max_threads() < (end - begin)) ? get_max_threads() : (end - begin);
- Result *thread_results = new Result[num_threads];
difference_type length = end - begin;
+ thread_index_t num_threads =
+ std::min<difference_type>(get_max_threads(), length);
- for (thread_index_t i = 0; i < num_threads; i++)
- {
- thread_results[i] = r(thread_results[i], f(o, begin+i));
- }
+ Result *thread_results;
-#pragma omp parallel num_threads(num_threads)
+# pragma omp parallel num_threads(num_threads)
{
-#pragma omp for schedule(static, Settings::workstealing_chunk_size)
- for (difference_type pos = 0; pos < length; pos++)
+# pragma omp single
{
- thread_results[omp_get_thread_num()] = r(thread_results[omp_get_thread_num()], f(o, begin+pos));
+ num_threads = omp_get_num_threads();
+ thread_results = new Result[num_threads];
+
+ for (thread_index_t i = 0; i < num_threads; i++)
+ thread_results[i] = Result();
}
- }
+ thread_index_t iam = omp_get_thread_num();
+
+# pragma omp for schedule(static, Settings::workstealing_chunk_size)
+ for (difference_type pos = 0; pos < length; pos++)
+ thread_results[iam] =
+ r(thread_results[iam], f(o, begin+pos));
+ } //parallel
+
for (thread_index_t i = 0; i < num_threads; i++)
- {
output = r(output, thread_results[i]);
- }
delete [] thread_results;
@@ -106,6 +117,7 @@
return o;
}
+
} // end namespace
#endif
Index: include/parallel/random_shuffle.h
===================================================================
--- include/parallel/random_shuffle.h (revision 130225)
+++ include/parallel/random_shuffle.h (working copy)
@@ -45,16 +45,16 @@
namespace __gnu_parallel
{
- /** @brief Type to hold the index of a bin.
+/** @brief Type to hold the index of a bin.
*
* Since many variables of this type are allocated, it should be
* chosen as small as possible.
*/
- typedef unsigned short bin_index;
+typedef unsigned short bin_index;
- /** @brief Data known to every thread participating in
+/** @brief Data known to every thread participating in
__gnu_parallel::parallel_random_shuffle(). */
- template<typename RandomAccessIterator>
+template<typename RandomAccessIterator>
struct DRandomShufflingGlobalData
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
@@ -90,18 +90,15 @@
: source(_source) { }
};
- /** @brief Local data for a thread participating in
+/** @brief Local data for a thread participating in
__gnu_parallel::parallel_random_shuffle().
*/
- template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
struct DRSSorterPU
{
/** @brief Number of threads participating in total. */
int num_threads;
- /** @brief Number of owning thread. */
- int iam;
-
/** @brief Begin index for bins taken care of by this thread. */
bin_index bins_begin;
@@ -115,29 +112,29 @@
DRandomShufflingGlobalData<RandomAccessIterator>* sd;
};
- /** @brief Generate a random number in @c [0,2^logp).
+/** @brief Generate a random number in @c [0,2^logp).
* @param logp Logarithm (basis 2) of the upper range bound.
* @param rng Random number generator to use.
*/
- template<typename RandomNumberGenerator>
+template<typename RandomNumberGenerator>
inline int
random_number_pow2(int logp, RandomNumberGenerator& rng)
{ return rng.genrand_bits(logp); }
- /** @brief Random shuffle code executed by each thread.
+/** @brief Random shuffle code executed by each thread.
* @param pus Array of thread-local data records. */
- template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
- parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
+ parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
RandomNumberGenerator>* pus)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
typedef typename traits_type::difference_type difference_type;
- DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
+ thread_index_t iam = omp_get_thread_num();
+ DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
- thread_index_t iam = d->iam;
// Indexing: dist[bin][processor]
difference_type length = sd->starts[iam + 1] - sd->starts[iam];
@@ -166,9 +163,9 @@
for (bin_index b = 0; b < sd->num_bins + 1; b++)
sd->dist[b][iam + 1] = dist[b];
-#pragma omp barrier
+# pragma omp barrier
-#pragma omp single
+# pragma omp single
{
// Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
// total number of items in bin s
@@ -178,13 +175,13 @@
sd->dist[s + 1]);
}
-#pragma omp barrier
+# pragma omp barrier
sequence_index_t offset = 0, global_offset = 0;
for (bin_index s = 0; s < d->bins_begin; s++)
global_offset += sd->dist[s + 1][d->num_threads];
-#pragma omp barrier
+# pragma omp barrier
for (bin_index s = d->bins_begin; s < d->bins_end; s++)
{
@@ -193,9 +190,10 @@
offset = sd->dist[s + 1][d->num_threads];
}
- sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
+ sd->temporaries[iam] = static_cast<value_type*>(
+ ::operator new(sizeof(value_type) * offset));
-#pragma omp barrier
+# pragma omp barrier
// Draw local copies to avoid false sharing.
for (bin_index b = 0; b < sd->num_bins + 1; b++)
@@ -223,23 +221,27 @@
delete[] bin_proc;
delete[] temporaries;
-#pragma omp barrier
+# pragma omp barrier
// Shuffle bins internally.
for (bin_index b = d->bins_begin; b < d->bins_end; b++)
{
- value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
- * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
+ value_type* begin =
+ sd->temporaries[iam] +
+ ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
+ * end =
+ sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
sequential_random_shuffle(begin, end, rng);
- std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
+ std::copy(begin, end, sd->source + global_offset +
+ ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
}
delete[] sd->temporaries[iam];
}
- /** @brief Round up to the next greater power of 2.
+/** @brief Round up to the next greater power of 2.
* @param x Integer to round up */
- template<typename T>
+template<typename T>
T
round_up_to_pow2(T x)
{
@@ -249,16 +251,21 @@
return (T)1 << (log2(x - 1) + 1);
}
- /** @brief Main parallel random shuffle step.
+/** @brief Main parallel random shuffle step.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param n Length of sequence.
* @param num_threads Number of threads to use.
* @param rng Random number generator to use.
*/
- template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
- parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
+ parallel_random_shuffle_drs(
+ RandomAccessIterator begin,
+ RandomAccessIterator end,
+ typename std::iterator_traits<RandomAccessIterator>::difference_type n,
+ thread_index_t num_threads,
+ RandomNumberGenerator& rng)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
typedef typename traits_type::value_type value_type;
@@ -275,16 +282,17 @@
// Try the L1 cache first.
// Must fit into L1.
- num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
+ num_bins_cache = std::max<difference_type>(
+ 1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size.
- num_bins = std::min(n, (difference_type)num_bins_cache);
+ num_bins = std::min<difference_type>(n, num_bins_cache);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin.
- num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+ num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
@@ -293,32 +301,41 @@
#endif
// Now try the L2 cache
// Must fit into L2
- num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
+ num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
+ 1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2.
- num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
+ num_bins = static_cast<bin_index>(
+ std::min(n, static_cast<difference_type>(num_bins_cache)));
// Power of 2 and at least one element per bin, at most the TLB size.
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin.
- num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+ num_bins = std::min(
+ static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
#endif
num_bins = round_up_to_pow2(num_bins);
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
}
#endif
- num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
+ num_threads = std::min<bin_index>(num_threads, num_bins);
if (num_threads <= 1)
return sequential_random_shuffle(begin, end, rng);
DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
+ DRSSorterPU<RandomAccessIterator, random_number >* pus;
+ difference_type* starts;
- DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
+# pragma omp parallel num_threads(num_threads)
+ {
+# pragma omp single
+ {
+ pus = new DRSSorterPU<RandomAccessIterator, random_number>
+ [num_threads];
sd.temporaries = new value_type*[num_threads];
- //sd.oracles = new bin_index[n];
sd.dist = new difference_type*[num_bins + 1];
sd.bin_proc = new thread_index_t[num_bins];
for (bin_index b = 0; b < num_bins + 1; b++)
@@ -328,34 +345,36 @@
sd.dist[0][0] = 0;
sd.dist[b][0] = 0;
}
- difference_type* starts = sd.starts = new difference_type[num_threads + 1];
+ starts = sd.starts = new difference_type[num_threads + 1];
int bin_cursor = 0;
sd.num_bins = num_bins;
sd.num_bits = log2(num_bins);
- difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
- int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
- for (int i = 0; i < num_threads; i++)
+ difference_type chunk_length = n / num_threads,
+ split = n % num_threads, start = 0;
+ difference_type bin_chunk_length = num_bins / num_threads,
+ bin_split = num_bins % num_threads;
+ for (thread_index_t i = 0; i < num_threads; i++)
{
starts[i] = start;
start += (i < split) ? (chunk_length + 1) : chunk_length;
int j = pus[i].bins_begin = bin_cursor;
// Range of bins for this processor.
- bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
+ bin_cursor += (i < bin_split) ?
+ (bin_chunk_length + 1) : bin_chunk_length;
pus[i].bins_end = bin_cursor;
for (; j < bin_cursor; j++)
sd.bin_proc[j] = i;
pus[i].num_threads = num_threads;
- pus[i].iam = i;
pus[i].seed = rng(std::numeric_limits<uint32>::max());
pus[i].sd = &sd;
}
starts[num_threads] = start;
-
+ } //single
// Now shuffle in parallel.
-#pragma omp parallel num_threads(num_threads)
parallel_random_shuffle_drs_pu(pus);
+ }
delete[] starts;
delete[] sd.bin_proc;
@@ -367,15 +386,15 @@
delete[] pus;
}
- /** @brief Sequential cache-efficient random shuffle.
+/** @brief Sequential cache-efficient random shuffle.
* @param begin Begin iterator of sequence.
* @param end End iterator of sequence.
* @param rng Random number generator to use.
*/
- template<typename RandomAccessIterator, typename RandomNumberGenerator>
+template<typename RandomAccessIterator, typename RandomNumberGenerator>
inline void
sequential_random_shuffle(RandomAccessIterator begin,
- RandomAccessIterator end,
+ RandomAccessIterator end,
RandomNumberGenerator& rng)
{
typedef std::iterator_traits<RandomAccessIterator> traits_type;
@@ -388,7 +407,9 @@
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
// Try the L1 cache first, must fit into L1.
- num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
+ num_bins_cache =
+ std::max<difference_type>
+ (1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
@@ -404,16 +425,20 @@
{
#endif
// Now try the L2 cache, must fit into L2.
- num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
+ num_bins_cache =
+ static_cast<bin_index>(std::max<difference_type>(
+ 1, n / (Settings::L2_cache_size / sizeof(value_type))));
num_bins_cache = round_up_to_pow2(num_bins_cache);
// No more buckets than TLB entries, power of 2
// Power of 2 and at least one element per bin, at most the TLB size.
- num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
+ num_bins = static_cast<bin_index>
+ (std::min(n, static_cast<difference_type>(num_bins_cache)));
#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
// 2 TLB entries needed per bin
- num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
+ num_bins =
+ std::min<difference_type>(Settings::TLB_size / 2, num_bins);
#e