This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Improve the performance of std::uniform_int_distribution (fewer divisions) [Potential patch]


Thank you Jonathan.

Here is a corrected patch where we guard the use of uint128_t with
_GLIBCXX_USE_INT128 and where we
use sizeof instead of std::is_same to detect the 32-bit and 64-bit cases.
If this looks fine, I (or someone else) can
post it as a patch proposal.

Index: trunk/libstdc++-v3/include/bits/uniform_int_dist.h
===================================================================
--- trunk/libstdc++-v3/include/bits/uniform_int_dist.h (revision 275324)
+++ trunk/libstdc++-v3/include/bits/uniform_int_dist.h (working copy)
@@ -33,6 +33,7 @@

 #include <type_traits>
 #include <limits>
+#include <cstdint>

 namespace std _GLIBCXX_VISIBILITY(default)
 {
@@ -242,15 +243,59 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION

  if (__urngrange > __urange)
   {
-    // downscaling
-    const __uctype __uerange = __urange + 1; // __urange can be zero
-    const __uctype __scaling = __urngrange / __uerange;
-    const __uctype __past = __uerange * __scaling;
-    do
-      __ret = __uctype(__urng()) - __urngmin;
-    while (__ret >= __past);
-    __ret /= __scaling;
-  }
+ const __uctype __uerange = __urange + 1; // __urange can be zero
+#if _GLIBCXX_USE_INT128 == 1
+    if(sizeof(__uctype) == sizeof(uint64_t) and
+      (__urngrange == numeric_limits<uint64_t>::max()))
+    {
+      // 64-bit case
+      // reference: Fast Random Integer Generation in an Interval
+      // ACM Transactions on Modeling and Computer Simulation 29 (1), 2019
+      // https://arxiv.org/abs/1805.10941
+      __uint128_t __product = (__uctype(__urng()) - __urngmin) * __uerange;
+      uint64_t __lsb = uint64_t(__product);
+      if(__lsb < __uerange)
+      {
+        uint64_t __threshold = -__uerange % __uerange;
+        while (__lsb < __threshold)
+        {
+          __product = (__uctype(__urng()) - __urngmin) * __uerange;
+          __lsb = uint64_t(__product);
+        }
+      }
+      __ret = __product >> 64;
+    }
+    else
+#endif // _GLIBCXX_USE_INT128 == 1
+    if(sizeof(__uctype) == sizeof(uint32_t)
+      and (__urngrange == numeric_limits<uint32_t>::max()) )
+    {
+      // 32-bit case
+      // reference: Fast Random Integer Generation in an Interval
+      // ACM Transactions on Modeling and Computer Simulation 29 (1), 2019
+      // https://arxiv.org/abs/1805.10941
+      uint64_t __product = (__uctype(__urng()) - __urngmin) * __uerange;
+      uint32_t __lsb = uint32_t(__product);
+      if(__lsb < __uerange) {
+        uint64_t __threshold = -__uerange % __uerange;
+        while (__lsb < __threshold) {
+          __product = (__uctype(__urng()) - __urngmin) * __uerange;
+          __lsb = uint32_t(__product);
+        }
+      }
+      __ret = __product >> 32;
+    }
+    else
+    {
+      // fallback case (2 divisions)
+      const __uctype __scaling = __urngrange / __uerange;
+      const __uctype __past = __uerange * __scaling;
+      do
+        __ret = __uctype(__urng()) - __urngmin;
+      while (__ret >= __past);
+      __ret /= __scaling;
+    }
+  }
  else if (__urngrange < __urange)
   {
     // upscaling



>Even on recent processors, integer division is relatively expensive.
> >The current implementation of  std::uniform_int_distribution typically
> >requires two divisions by invocation:
> >
> >        // downscaling
> >        const __uctype __uerange = __urange + 1; // __urange can be zero
> >        const __uctype __scaling = __urngrange / __uerange;
> >        const __uctype __past = __uerange * __scaling;
> >        do
> >          __ret = __uctype(__urng()) - __urngmin;
> >        while (__ret >= __past);
> >        __ret /= __scaling;
> >
> >We can achieve the same algorithmic result with at most one division, and
> >typically
> >no division at all without requiring more calls to the random number
> >generator.
> >This was recently added to Swift (
> https://github.com/apple/swift/pull/25286)
> >
> >
> >The main challenge is that we need to be able to compute the "full"
> >product. E.g.,
> >given two 64-bit integers, we want the 128-bit result; given two 32-bit
> >integers
> >we want the 64-bit result. This is fast on common processors. The 128-bit
> >product
> >is not natively supported in C/C++ but can be achieved with the
> __uint128_t
> >extension
> >which is widely supported by GCC.
> >
> >For example, if we replace the above code by the following in the case
> where
> >__uctype is a 32-bit type, we get a substantial performance boost. E.g.,
> it
> >can
> >be twice as fast to sort arrays of 1 million elements (e.g., using the
> >following
> >benchmark: https://github.com/lemire/simple_cpp_shuffle_benchmark )
> >
> >
> >      const __uctype __uerange = __urange + 1; // __urange can be zero
> >      uint64_t __product = (__uctype(__urng()) - __urngmin) * __uerange;
> >      uint32_t __lsb = uint32_t(__product);
> >      if(__lsb < __uerange) {
> >        uint64_t __threshold = -__uerange % __uerange;
> >        while (__lsb < __threshold) {
> >          __product = (__uctype(__urng()) - __urngmin) * __uerange;
> >          __lsb = uint32_t(__product);
> >        }
> >      }
> >
> >I include a potential patch that would bring better performance to
> >std::uniform_int_distribution at least in some cases.
> >
> >Reference: Fast Random Integer Generation in an Interval, ACM Transactions
> >on
> >Modeling and Computer Simulation 29 (1), 2019
> >https://arxiv.org/abs/1805.10941
> >
> >
> >
> >
> >Index: trunk/libstdc++-v3/include/bits/uniform_int_dist.h
> >===================================================================
> >--- trunk/libstdc++-v3/include/bits/uniform_int_dist.h (revision 275324)
> >+++ trunk/libstdc++-v3/include/bits/uniform_int_dist.h (working copy)
> >@@ -33,6 +33,7 @@
> >
> > #include <type_traits>
> > #include <limits>
> >+#include <cstdint>
> >
> > namespace std _GLIBCXX_VISIBILITY(default)
> > {
> >@@ -242,14 +243,55 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >
> >  if (__urngrange > __urange)
> >   {
> >-    // downscaling
> >-    const __uctype __uerange = __urange + 1; // __urange can be zero
> >-    const __uctype __scaling = __urngrange / __uerange;
> >-    const __uctype __past = __uerange * __scaling;
> >-    do
> >-      __ret = __uctype(__urng()) - __urngmin;
> >-    while (__ret >= __past);
> >-    __ret /= __scaling;
> >+ const __uctype __uerange = __urange + 1; // __urange can be zero
> >+ if(std::is_same<__uctype, uint64_t>::value and
>
> What about the case where sizeof(__uctype) == sizeof(uint64_t) but
> they're not the same type? e.g. uint64_t might be unsigned long, but
> we this technique would be equally valid for unsigned long long.
>
> >+  (__urngrange == numeric_limits<uint64_t>::max()) )
> >+  {
> >+ // 64-bit case
> >+ // reference: Fast Random Integer Generation in an Interval
> >+ // ACM Transactions on Modeling and Computer Simulation 29 (1), 2019
> >+ // https://arxiv.org/abs/1805.10941
> >+ __uint128_t __product = (__uctype(__urng()) - __urngmin) * __uerange;
>
> This looks worth pursuing, but we can't just use __uint128_t like
> that, because it isn't supported on all targets. The improved
> algorithm needs to be used conditionally (probably depending on a
> preprocessor macro like _GLIBCXX_USE_INT128).
>
>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]