This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [C++ PATCH] Speed up inplace_merge algorithm & fix inefficient logic(PR libstdc++/83938)


On 12/21/18 9:57 PM, Jonathan Wakely wrote:
On 29/10/18 07:06 +0100, François Dumont wrote:
Hi

    Some feedback regarding this patch ?

Sorry this got missed, please resubmit during stage 1.

You haven't CC'd the original patch author (chang jc) to give them a
chance to comment on your proposed changes to the patch.

The attached PDF on PR libstdc++/83938 has extensive discussion of the
performance issue, but I don't see any for your version. Some
performance benchmarks for your version would be helpful.

Here is this patch again.

This time it is much closer to John's original one, I just kept my change on the size of the temporary buffer which doesn't need to be as large as it is currently allocated, especially with John's patch.

The performance tests are showing the good improvements, attached are the before/after. Surprisingly I do not see any change regarding allocated memory, I guess measures are not accurate enough. However working on them also show that current implementation is fragile. If you reduce the allowed allocated memory too much you end up with a stack overflow because of the recursive implementation.

I already have a patch to keep on trying to allocate memory as long as above a given threshold rather than 0 but I am also working on making implementation non-recursive. We'll see if even a buffer of size 1 is better than no buffer at all then.

    PR libstdc++/83938
    * include/bits/stl_algo.h:
    (__merge_adaptive): When buffer too small consider smallest range first
    and cut based on buffer size.
    (__inplace_merge): Take temporary buffer length from smallest range.
    (__stable_sort): Limit temporary buffer length.
    * include/bits/stl_tempbuf.h (get_temporary_buffer): Change function
    to reduce requested buffer length on allocation failure.
    * testsuite/performance/25_algorithms/inplace_merge.cc: New.
    * testsuite/performance/25_algorithms/stable_sort.cc: Rework to allow
    execution with different level of memory.

François

Attachment: after_libstdc++-performance.sum
Description: Text document

Attachment: before_libstdc++-performance.sum
Description: Text document

diff --git a/libstdc++-v3/include/bits/stl_algo.h b/libstdc++-v3/include/bits/stl_algo.h
index 1ba447bcb6e..e36efb515e8 100644
--- a/libstdc++-v3/include/bits/stl_algo.h
+++ b/libstdc++-v3/include/bits/stl_algo.h
@@ -2418,7 +2418,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  _BidirectionalIterator __second_cut = __middle;
 	  _Distance __len11 = 0;
 	  _Distance __len22 = 0;
-	  if (__len1 > __len2)
+	  if (__len1 < __len2)
 	    {
 	      __len11 = __len1 / 2;
 	      std::advance(__first_cut, __len11);
@@ -2439,14 +2439,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 	  _BidirectionalIterator __new_middle
 	    = std::__rotate_adaptive(__first_cut, __middle, __second_cut,
-				     __len1 - __len11, __len22, __buffer,
-				     __buffer_size);
-	  std::__merge_adaptive(__first, __first_cut, __new_middle, __len11,
-				__len22, __buffer, __buffer_size, __comp);
+				     __len1 - __len11, __len22,
+				     __buffer, __buffer_size);
+	  std::__merge_adaptive(__first, __first_cut, __new_middle,
+				__len11, __len22,
+				__buffer, __buffer_size, __comp);
 	  std::__merge_adaptive(__new_middle, __second_cut, __last,
-				__len1 - __len11,
-				__len2 - __len22, __buffer,
-				__buffer_size, __comp);
+				__len1 - __len11, __len2 - __len22,
+				__buffer, __buffer_size, __comp);
 	}
     }
 
@@ -2520,7 +2520,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       const _DistanceType __len2 = std::distance(__middle, __last);
 
       typedef _Temporary_buffer<_BidirectionalIterator, _ValueType> _TmpBuf;
-      _TmpBuf __buf(__first, __len1 + __len2);
+      _TmpBuf __buf(__first, std::min(__len1, __len2));
 
       if (__buf.begin() == 0)
 	std::__merge_without_buffer
@@ -2728,6 +2728,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  std::__merge_sort_with_buffer(__first, __middle, __buffer, __comp);
 	  std::__merge_sort_with_buffer(__middle, __last, __buffer, __comp);
 	}
+
       std::__merge_adaptive(__first, __middle, __last,
 			    _Distance(__middle - __first),
 			    _Distance(__last - __middle),
@@ -4980,8 +4981,11 @@ _GLIBCXX_BEGIN_NAMESPACE_ALGO
       typedef typename iterator_traits<_RandomAccessIterator>::difference_type
 	_DistanceType;
 
+      if (__first == __last)
+	return;
+
       typedef _Temporary_buffer<_RandomAccessIterator, _ValueType> _TmpBuf;
-      _TmpBuf __buf(__first, std::distance(__first, __last));
+      _TmpBuf __buf(__first, (__last - __first + 1) / 2);
 
       if (__buf.begin() == 0)
 	std::__inplace_stable_sort(__first, __last, __comp);
diff --git a/libstdc++-v3/include/bits/stl_tempbuf.h b/libstdc++-v3/include/bits/stl_tempbuf.h
index fa78ee53eb4..b6ad9ee6a46 100644
--- a/libstdc++-v3/include/bits/stl_tempbuf.h
+++ b/libstdc++-v3/include/bits/stl_tempbuf.h
@@ -95,7 +95,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 							std::nothrow));
 	  if (__tmp != 0)
 	    return std::pair<_Tp*, ptrdiff_t>(__tmp, __len);
-	  __len /= 2;
+	  __len = __len == 1 ? 0 : ((__len + 1) / 2);
 	}
       return std::pair<_Tp*, ptrdiff_t>(static_cast<_Tp*>(0), 0);
     }
diff --git a/libstdc++-v3/testsuite/performance/25_algorithms/inplace_merge.cc b/libstdc++-v3/testsuite/performance/25_algorithms/inplace_merge.cc
new file mode 100644
index 00000000000..8522fcc8044
--- /dev/null
+++ b/libstdc++-v3/testsuite/performance/25_algorithms/inplace_merge.cc
@@ -0,0 +1,130 @@
+// Copyright (C) 2019 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#include <vector>
+#include <algorithm>
+
+#include <testsuite_new_operators.h>
+#include <testsuite_performance.h>
+
+const int max_size = 10000000;
+
+void bench(int mem_threshold, int pivot_index,
+	   std::vector<int> revv,
+	   std::vector<int> fwdv,
+	   std::vector<int> rndv)
+{
+  using namespace __gnu_test;
+
+  time_counter time;
+  resource_counter resource;
+
+  std::vector<int>::iterator pivot = revv.begin() + pivot_index;
+  std::stable_sort(revv.begin(), pivot);
+  std::stable_sort(pivot, revv.end());
+
+  set_new_limit(mem_threshold);
+
+  start_counters(time, resource);
+  std::inplace_merge(revv.begin(), pivot, revv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+
+  report_performance(__FILE__, "reverse", time, resource);
+  clear_counters(time, resource);
+
+  pivot = fwdv.begin() + pivot_index;
+  std::stable_sort(fwdv.begin(), pivot);
+  std::stable_sort(pivot, fwdv.end());
+
+  set_new_limit(mem_threshold);
+
+  start_counters(time, resource);
+  std::inplace_merge(fwdv.begin(), pivot, fwdv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+
+  report_performance(__FILE__, "forwards", time, resource);
+  clear_counters(time, resource);
+
+  pivot = rndv.begin() + pivot_index;
+  std::stable_sort(rndv.begin(), pivot);
+  std::stable_sort(pivot, rndv.end());
+
+  set_new_limit(mem_threshold);
+
+  start_counters(time, resource);
+  std::inplace_merge(rndv.begin(), pivot, rndv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+  report_performance(__FILE__, "random", time, resource);
+}
+
+int main()
+{
+  using namespace __gnu_test;
+
+  // No constraint to build vectors.
+  set_new_limit(~size_t(0));
+
+  std::vector<int> revv(max_size);
+  for (int i = 0; i < max_size; ++i)
+    revv[i] = -i;
+
+  std::vector<int> fwdv(max_size);
+  for (int i = 0; i < max_size; ++i)
+    fwdv[i] = i;
+
+  // a simple pseudo-random series which does not rely on rand() and friends
+  std::vector<int> rndv(max_size);
+  rndv[0] = 0;
+  for (int i = 1; i < max_size; ++i)
+    rndv[i] = (rndv[i-1] + 110211473) * 745988807;
+
+  time_counter time;
+  resource_counter resource;
+
+  start_counters(time, resource);
+  // Limit to half the expected size of the sorted array.
+  bench(max_size * sizeof(int) / 2, 10, revv, fwdv, rndv);
+  bench(max_size * sizeof(int) / 2, max_size / 2, revv, fwdv, rndv);
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1/2 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+  // Limit to the fourth.
+  bench(max_size * sizeof(int) / 4, 10, revv, fwdv, rndv);
+  bench(max_size * sizeof(int) / 4, max_size / 2, revv, fwdv, rndv);
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1/4 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+  // Forbid any allocation.
+  bench(0, 10, revv, fwdv, rndv);
+  bench(0, max_size / 2, revv, fwdv, rndv);
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench no memory", time, resource);
+  return 0;
+}
diff --git a/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc b/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc
index e79e0a7f6b2..f997b2a72ce 100644
--- a/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc
+++ b/libstdc++-v3/testsuite/performance/25_algorithms/stable_sort.cc
@@ -17,49 +17,102 @@
 
 #include <vector>
 #include <algorithm>
+
+#include <testsuite_new_operators.h>
 #include <testsuite_performance.h>
 
-int main()
+const int max_size = 10000000;
+
+void bench(size_t mem_threshold,
+	   std::vector<int> revv,
+	   std::vector<int> fwdv,
+	   std::vector<int> rndv)
 {
   using namespace __gnu_test;
 
   time_counter time;
   resource_counter resource;
 
-  const int max_size = 10000000;
-
-  std::vector<int> v(max_size);
-
-  for (int i = 0; i < max_size; ++i)
-    v[i] = -i;
+  set_new_limit(mem_threshold);
 
   start_counters(time, resource);
-  std::stable_sort(v.begin(), v.end());
+  std::stable_sort(revv.begin(), revv.end());
   stop_counters(time, resource);
 
+  set_new_limit(~size_t(0));
   report_performance(__FILE__, "reverse", time, resource);
   clear_counters(time, resource);
 
-  for (int i = 0; i < max_size; ++i)
-    v[i] = i;
+  set_new_limit(mem_threshold);
 
   start_counters(time, resource);
-  std::stable_sort(v.begin(), v.end());
+  std::stable_sort(fwdv.begin(), fwdv.end());
   stop_counters(time, resource);
 
+  set_new_limit(~size_t(0));
   report_performance(__FILE__, "forwards", time, resource);
   clear_counters(time, resource);
 
-  // a simple psuedo-random series which does not rely on rand() and friends
-  v[0] = 0;
+  start_counters(time, resource);
+  std::stable_sort(rndv.begin(), rndv.end());
+  stop_counters(time, resource);
+
+  set_new_limit(~size_t(0));
+  report_performance(__FILE__, "random", time, resource);
+}
+
+int main()
+{
+  using namespace __gnu_test;
+
+  // No memory constraint.
+  set_new_limit(~size_t(0));
+
+  std::vector<int> revv(max_size);
+  for (int i = 0; i < max_size; ++i)
+    revv[i] = -i;
+
+  std::vector<int> fwdv(max_size);
+  for (int i = 0; i < max_size; ++i)
+    fwdv[i] = i;
+
+  // a simple pseudo-random series which does not rely on rand() and friends
+  std::vector<int> rndv(max_size);
+  rndv[0] = 0;
   for (int i = 1; i < max_size; ++i)
-    v[i] = (v[i-1] + 110211473) * 745988807;
+    rndv[i] = (rndv[i-1] + 110211473) * 745988807;
+
+  time_counter time;
+  resource_counter resource;
 
   start_counters(time, resource);
-  std::stable_sort(v.begin(), v.end());
+  bench(~size_t(0), revv, fwdv, rndv);
   stop_counters(time, resource);
 
-  report_performance(__FILE__, "random", time, resource);
+  report_performance(__FILE__, "bench full memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+  // Limit to half the expected size of the sorted array.
+  bench(max_size * sizeof(int) / 2, revv, fwdv, rndv);
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1/2 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+  // Limit to the fourth.
+  bench(max_size * sizeof(int) / 4, revv, fwdv, rndv);
+  stop_counters(time, resource);
+
+  report_performance(__FILE__, "bench 1/4 memory", time, resource);
+  clear_counters(time, resource);
+
+  start_counters(time, resource);
+  // Forbid any allocation.
+  bench(0, revv, fwdv, rndv);
+  stop_counters(time, resource);
 
+  report_performance(__FILE__, "bench no memory", time, resource);
   return 0;
 }

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]