This is the mail archive of the
libstdc++@gcc.gnu.org
mailing list for the libstdc++ project.
[patch 1/N] std::regex refactoring - _BracketMatcher
- From: Jonathan Wakely <jwakely at redhat dot com>
- To: Tim Shen <timshen91 at gmail dot com>
- Cc: libstdc++ at gcc dot gnu dot org, gcc-patches at gcc dot gnu dot org
- Date: Mon, 28 Apr 2014 13:40:03 +0100
- Subject: [patch 1/N] std::regex refactoring - _BracketMatcher
- Authentication-results: sourceware.org; auth=none
Hi,
I've been looking through the regex code and have a few ideas for
simplifications or optimisations that I'd like to share.
This first patch is for _BracketMatcher. We only use std::bitset when
is_same<_CharT, char> so 8 * sizeof(_CharT) should be __CHAR_BIT__
instead. We also only user _UnsignedCharT when is_same<_CharT, char>
so that can just be simplified to unsigned char.
The contents of _BracketMatcher::_M_char_set are not sorted and can
contain duplicates in the current code. Making that a sorted, unique
list in _BracketMatcher::_M_ready() allows a binary search instead of
linear search. This improves worst case performance for pathological
regular expressions like std::wregex('['+std::wstring(1000, 'a')+"b]")
but I'm not sure if it helps in the common case.
Finally, in the non-char case the _CacheT member is an unused empty
object, so having that as the first member requires 7 bytes of
padding. Re-ordering the members reduces the size of a non-char
_BracketMatcher by 8 bytes (but it's still a whopping 96 bytes).
(For a char _BracketMatcher the bitset cache makes it 128 bytes,
this patch doesn't change that).
Thoughts?
diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index f5a198f..a9dd8d3 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -396,6 +396,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
void
_M_ready()
{
+ std::sort(_M_char_set.begin(), _M_char_set.end());
+ auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
+ _M_char_set.erase(__end, _M_char_set.end());
_M_make_cache(_IsChar());
#ifdef _GLIBCXX_DEBUG
_M_is_ready = true;
@@ -405,10 +408,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
private:
typedef typename is_same<_CharT, char>::type _IsChar;
struct _Dummy { };
+ static constexpr size_t _S_cache_size() { return 1ul << __CHAR_BIT__; }
typedef typename conditional<_IsChar::value,
- std::bitset<1 << (8 * sizeof(_CharT))>,
+ std::bitset<_S_cache_size()>,
_Dummy>::type _CacheT;
- typedef typename make_unsigned<_CharT>::type _UnsignedCharT;
private:
bool
@@ -416,14 +419,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
bool
_M_apply(_CharT __ch, true_type) const
- { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; }
+ { return _M_cache[static_cast<unsigned char>(__ch)]; }
void
_M_make_cache(true_type)
{
- for (int __i = 0; __i < _M_cache.size(); __i++)
- _M_cache[static_cast<_UnsignedCharT>(__i)] =
- _M_apply(__i, false_type());
+ for (unsigned __i = 0; __i < _S_cache_size(); __i++)
+ _M_cache[__i] = _M_apply(static_cast<char>(__i), false_type());
}
void
@@ -431,13 +433,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ }
private:
- _CacheT _M_cache;
std::vector<_CharT> _M_char_set;
std::vector<_StringT> _M_equiv_set;
std::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
_CharClassT _M_class_set;
_TransT _M_translator;
const _TraitsT& _M_traits;
+ _CacheT _M_cache;
bool _M_is_non_matching;
#ifdef _GLIBCXX_DEBUG
bool _M_is_ready;
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index 128dac1..36edfba 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -507,12 +507,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_BracketMatcher<_TraitsT, __icase, __collate>::
_M_apply(_CharT __ch, false_type) const
{
- bool __ret = false;
- if (std::find(_M_char_set.begin(), _M_char_set.end(),
- _M_translator._M_translate(__ch))
- != _M_char_set.end())
- __ret = true;
- else
+ bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(),
+ _M_translator._M_translate(__ch));
+ if (!__ret)
{
auto __s = _M_translator._M_transform(__ch);
for (auto& __it : _M_range_set)