This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Optimize to_chars


We've already beaten this topic to death, so let's put a final nail in
the coffin:


__to_chars_10_impl is quite fast. According to the IACA the main loop
takes only 6.0 cycles, the whole function with one iteration takes
10.0 cycles. Replacing the __first[pos] and __first[pos - 1] with
__first[0] and __first[1] drops the function time to 7.53 cycles.

Changelog:

2019-09-08  Antony Polukhin  <antoshkka@gmail.com>

    * include/bits/charconv.h (__detail::__to_chars_10_impl): Replace
    final offsets with constants.


And that's the only optimization that improves all the usecases and
reduces code size on 3 instructions.

Different approaches for optimizing the loop were showing different
results depending on the workload. The most interesting result gives
the compressed table of binary coded decimals:

static constexpr unsigned char __binary_coded_decimals[50] =
{ 0x00, 0x02, 0x04, 0x06, 0x08,  0x10... 0x98 };
unsigned __pos = __len - 1;
while (__val >= 100)
{
  auto const addition = __val & 1;
  auto const __num = (__val % 100) >> 1;
  __val /= 100;
  auto const __bcd = __binary_coded_decimals[__num];
  __first[__pos] = '0' + (__bcd & 0xf) + addition;
  __first[__pos - 1] = '0' + (__bcd >> 4);
  __pos -= 2;
}

That approach shows the same results or even outperforms the existing
approach with __digits[201] = "0001020304..." in case of cold cache.
It also produces slightly smaller binaries. Unfortunately on a warmed
up cache it's slower than the existing approach. I don't think that
it's a worth change.

Attaching some of the benchmarks as a separate file (not for merge,
just something to experiment with).


-- 
Best regards,
Antony Polukhin

Attachment: to_chars_10_impl_patch.txt
Description: Text document

#include <type_traits>
#include <vector>
#include <benchmark/benchmark.h>


// Adjust those variables to test on different workloads
constexpr unsigned From = 130000;
constexpr unsigned To   = 130004;
constexpr unsigned Len  = 6;
// Bytes of CPU cache to reset
#define RANGES ->Args({16 * 1024 * 1024})



namespace std { namespace __detail {
  template<typename _Tp>
    int
    __to_chars_10_table_2_impl(char* __first, unsigned __len, _Tp __val) noexcept
    {
      static_assert(is_integral<_Tp>::value, "implementation bug");
      static_assert(is_unsigned<_Tp>::value, "implementation bug");

      static constexpr char __digits[201] =
    "0001020304050607080910111213141516171819"
    "2021222324252627282930313233343536373839"
    "4041424344454647484950515253545556575859"
    "6061626364656667686970717273747576777879"
    "8081828384858687888990919293949596979899";
      unsigned __pos = __len - 1;
      while (__val >= 100)
    {
      auto const __num = (__val % 100) * 2;
      __val /= 100;
      __first[__pos] = __digits[__num + 1];
      __first[__pos - 1] = __digits[__num];
      __pos -= 2;
    }
      if (__val >= 10)
    {
      auto const __num = __val * 2;
      __first[1] = __digits[__num + 1];
      __first[0] = __digits[__num];
    }
      else
    __first[0] = '0' + __val;
    return 0;
    }

      template<typename _Tp>
    int
    __to_chars_10_bcd_impl(char* __first, unsigned __len, _Tp __val) noexcept
    {
      static_assert(is_integral<_Tp>::value, "implementation bug");
      static_assert(is_unsigned<_Tp>::value, "implementation bug");

      static constexpr unsigned char __binary_coded_decimals[100] =
    { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10
  , 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20
  , 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x30
  , 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x40
  , 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x50
  , 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x60
  , 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70
  , 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x80
  , 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x90
  , 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99};
      unsigned __pos = __len - 1;
      while (__val >= 100)
    {
      auto const __num = __val % 100;
      __val /= 100;
    auto const __bcd = __binary_coded_decimals[__num];
      __first[__pos] = '0' + (__bcd & 0xf);
      __first[__pos - 1] = '0' + (__bcd >> 4);
      __pos -= 2;
    }
      if (__val >= 10)
    {
    auto const __bcd = __binary_coded_decimals[__val];
      __first[1] = '0' + (__bcd & 0xf);
      __first[0] = '0' + (__bcd >> 4);
    }
      else
    __first[0] = '0' + __val;
    return 0;
    }

      template<typename _Tp>
    int
    __to_chars_10_bcd_compressed_impl(char* __first, unsigned __len, _Tp __val) noexcept
    {
      static_assert(is_integral<_Tp>::value, "implementation bug");
      static_assert(is_unsigned<_Tp>::value, "implementation bug");

      static constexpr unsigned char __binary_coded_decimals[50] =
    { 0x00, 0x02, 0x04, 0x06, 0x08,  0x10
        , 0x12, 0x14, 0x16, 0x18,  0x20
        , 0x22, 0x24, 0x26, 0x28,  0x30
        , 0x32, 0x34, 0x36, 0x38,  0x40
        , 0x42, 0x44, 0x46, 0x48,  0x50
        , 0x52, 0x54, 0x56, 0x58,  0x60
        , 0x62, 0x64, 0x66, 0x68,  0x70
        , 0x72, 0x74, 0x76, 0x78,  0x80
        , 0x82, 0x84, 0x86, 0x88,  0x90
        , 0x92, 0x94, 0x96, 0x98 };
      unsigned __pos = __len - 1;
      while (__val >= 100)
    {
    auto const addition = __val & 1;
      auto const __num = (__val % 100) >> 1;
      __val /= 100;
    auto const __bcd = __binary_coded_decimals[__num];
      __first[__pos] = '0' + (__bcd & 0xf) + addition;
      __first[__pos - 1] = '0' + (__bcd >> 4);
      __pos -= 2;
    }
      if (__val >= 10)
    {
    auto const __bcd = __binary_coded_decimals[__val >> 1];
      __first[1] = '0' + (__bcd & 0xf) + (__val & 1);
      __first[0] = '0' + (__bcd >> 4);
    }
      else
    __first[0] = '0' + __val;
    return 0;
    }

      template<typename _Tp>
    _GLIBCXX14_CONSTEXPR int
    __to_chars_10_naive_impl(char* __first, unsigned __len, _Tp __val) noexcept
    {
      static_assert(is_integral<_Tp>::value, "implementation bug");
      static_assert(is_unsigned<_Tp>::value, "implementation bug");

      unsigned __pos = __len - 1;
      while (__val >= 100)
    {
      auto __num = __val % 10;
      __val /= 10;
      __first[__pos] = '0' + __num;
      __num = __val % 10;
      __val /= 10;
      __first[__pos - 1] = '0' + __num;
      __pos -= 2;
    }
      if (__val >= 10)
    {
      auto const __num = __val % 10;
      __val /= 10;
      __first[1] = '0' + __num;
      __first[0] = '0' + __val;
    }
      else
    __first[0] = '0' + __val;
    return 0;
    }

      template<typename _Tp>
    _GLIBCXX14_CONSTEXPR int
    __to_chars_10_naive_unr_impl(char* __first, unsigned __len, _Tp __val) noexcept
    {
      static_assert(is_integral<_Tp>::value, "implementation bug");
      static_assert(is_unsigned<_Tp>::value, "implementation bug");

      while (__len >= 6)
    {
      unsigned __pos = __len - 1;
      auto __num = __val % 10;
      __val /= 10;
      __first[__pos] = '0' + __num;

      __num = __val % 10;
      __val /= 10;
      __first[__pos - 1] = '0' + __num;

    __num = __val % 10;
      __val /= 10;
      __first[__pos - 2] = '0' + __num;

      __num = __val % 10;
      __val /= 10;
      __first[__pos - 3] = '0' + __num;

    __num = __val % 10;
      __val /= 10;
      __first[__pos - 4] = '0' + __num;

      __num = __val % 10;
      __val /= 10;
      __first[__pos - 5] = '0' + __num;
    __len -= 6;
    }

  switch (__len) {
    case 5:
      __first[4] = '0' + __val % 10;
      __val /= 10;
    case 4:
      __first[3] = '0' + __val % 10;
      __val /= 10;
    case 3:
      __first[2] = '0' + __val % 10;
      __val /= 10;
    case 2:
      __first[1] = '0' + __val % 10;
      __val /= 10;
    case 1:
      __first[0] = '0' + __val;
  }
    return 0;
    }

}}



static void FlushCacheNonpausing(unsigned FlushCacheBaytes) {
    std::vector<unsigned> resetter;
    resetter.resize(FlushCacheBaytes / sizeof(unsigned), FlushCacheBaytes);

    unsigned trash = static_cast<unsigned>(FlushCacheBaytes * FlushCacheBaytes);
    for (unsigned& v : resetter) {
        trash += v;
        v += trash;
    }

    benchmark::DoNotOptimize(trash);
    benchmark::DoNotOptimize(resetter);
}

static void naive(benchmark::State& state) {
    const unsigned FlushCacheBaytes = state.range();
  for (auto _ : state) {
       if (FlushCacheBaytes) {
        state.PauseTiming();
        FlushCacheNonpausing(FlushCacheBaytes);
        state.ResumeTiming();
       }
     for(unsigned i = From; i != To; ++i)
     {
        char buffer[Len];
        benchmark::DoNotOptimize(std::__detail::__to_chars_10_naive_impl(buffer, Len, i));
        benchmark::DoNotOptimize(buffer);
     }

  }
}
BENCHMARK(naive) RANGES;

static void unrolled(benchmark::State& state) {
    const unsigned FlushCacheBaytes = state.range();
  for (auto _ : state) {
       if (FlushCacheBaytes) {
        state.PauseTiming();
        FlushCacheNonpausing(FlushCacheBaytes);
        state.ResumeTiming();
       }
     for(unsigned i = From; i != To; ++i)
     {
        char buffer[Len];
        benchmark::DoNotOptimize(std::__detail::__to_chars_10_naive_unr_impl(buffer, Len, i));
        benchmark::DoNotOptimize(buffer);
     }

  }
}
BENCHMARK(unrolled) RANGES;


static void table_2(benchmark::State& state) {
    const unsigned FlushCacheBaytes = state.range();
  for (auto _ : state) {
       if (FlushCacheBaytes) {
        state.PauseTiming();
        FlushCacheNonpausing(FlushCacheBaytes);
        state.ResumeTiming();
       }
     for(unsigned i = From; i != To; ++i)
     {
        char buffer[Len];
        benchmark::DoNotOptimize(std::__detail::__to_chars_10_table_2_impl(buffer, Len, i));
        benchmark::DoNotOptimize(buffer);
     }

  }
}
BENCHMARK(table_2) RANGES;


static void bcd(benchmark::State& state) {
    const unsigned FlushCacheBaytes = state.range();
  for (auto _ : state) {
       if (FlushCacheBaytes) {
        state.PauseTiming();
        FlushCacheNonpausing(FlushCacheBaytes);
        state.ResumeTiming();
       }
     for(unsigned i = From; i != To; ++i)
     {
        char buffer[Len];
        benchmark::DoNotOptimize(std::__detail::__to_chars_10_bcd_impl(buffer, Len, i));
        benchmark::DoNotOptimize(buffer);
     }

  }
}
BENCHMARK(bcd) RANGES;


static void compressed_bin_coded(benchmark::State& state) {
    const unsigned FlushCacheBaytes = state.range();
  for (auto _ : state) {
       if (FlushCacheBaytes) {
        state.PauseTiming();
        FlushCacheNonpausing(FlushCacheBaytes);
        state.ResumeTiming();
       }
     for(unsigned i = From; i != To; ++i)
     {
        char buffer[Len];
        benchmark::DoNotOptimize(std::__detail::__to_chars_10_bcd_compressed_impl(buffer, Len, i));
        benchmark::DoNotOptimize(buffer);
     }

  }
}
BENCHMARK(compressed_bin_coded) RANGES;

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]