This is the mail archive of the gcc-bugs@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug libstdc++/66414] string::find ten times slower than strstr


https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66414

--- Comment #5 from Jonathan Wakely <redi at gcc dot gnu.org> ---
This patch would implement the requested change:

--- a/libstdc++-v3/include/bits/basic_string.h
+++ b/libstdc++-v3/include/bits/basic_string.h
@@ -2291,9 +2291,28 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
        *  it begins.  If not found, returns npos.
       */
       size_type
-      find(const _CharT* __s, size_type __pos = 0) const
+      find(const _CharT* __s, size_type __pos = 0) const _GLIBCXX_NOEXCEPT
       {
        __glibcxx_requires_string(__s);
+       if (__are_same<_Traits, char_traits<_CharT>>::__value)
+         {
+           if (__are_same<_CharT, char>::__value)
+             {
+               const char* __data = (const char*)data();
+               const char* __p
+                 = __builtin_strstr(__data + __pos, (const char*)__s);
+               return __p ? (__p - __data) : npos;
+             }
+#if _GLIBCXX_USE_WCHAR_T
+           else if (__are_same<_CharT, wchar_t>::__value)
+             {
+               const wchar_t* __data = (const wchar_t*)data();
+               const wchar_t* __p
+                 = __builtin_wcsstr(__data + __pos, (const wchar_t*)__s);
+               return __p ? (__p - __data) : npos;
+             }
+#endif
+         }
        return this->find(__s, __pos, traits_type::length(__s));
       }


But it causes a huge regression in this example, where currently it is hundreds
of times faster than strstr:

#include <string>
#include <cstring>
#include <chrono>
#include <iostream>
#include <cassert>

int main()
{
  std::string haystack(5000, 'a');
  std::string needle = haystack;
  needle.back() = 'b';
  std::size_t pos[1000];

  auto start = std::chrono::high_resolution_clock::now();
  for (auto& p : pos)
    p = haystack.find(needle.c_str());
  auto stop = std::chrono::high_resolution_clock::now();
  std::cout << (stop - start).count() << std::endl;
  for (auto p : pos)
    assert(p == std::string::npos);

  start = std::chrono::high_resolution_clock::now();
  for (auto& p : pos)
  {
    auto ptr = std::strstr(haystack.c_str(), needle.c_str());
    p = ptr ? ptr - haystack.c_str() : std::string::npos;
  }
  stop = std::chrono::high_resolution_clock::now();
  std::cout << (stop - start).count() << std::endl;
  for (auto p : pos)
    assert(p == std::string::npos);
}

This is because the current code uses char_traits::length() (i.e. strlen)
first, so if the needle is longer than the haystack we don't bother looking for
it.

I don't think it's clear that using strstr is an improvement.

The C++17 library provides Boyer-Moore and Boyer-Moore-Horspool searchers, so
that already offers you more control than basic_string::find or strstr.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]