[Patch] Patch set for regex instantiation
Tim Shen
timshen91@gmail.com
Thu Jan 16 23:37:00 GMT 2014
On Mon, Jan 13, 2014 at 5:40 AM, Paolo Carlini <paolo.carlini@oracle.com> wrote:
> .. note that, as a general policy, strchr (and all the other "C" library
> functions for that matter) should be invoked as std::strchr.
Bootstrapped, tested and committed, with strchr changed to std::strchr.
--
Regards,
Tim Shen
-------------- next part --------------
commit 3fca438f8c8db42554e6c79e666127543525fabe
Author: tim <timshen91@gmail.com>
Date: Fri Jan 10 18:06:38 2014 -0500
2014-01-16 Tim Shen <timshen91@gmail.com>
* include/bits/regex_compiler.h: Change _ScannerT to be templated on
the character type.
* include/bits/regex_scanner.h (_Scanner<>::_Scanner()): Separate
_ScannerBase from _Scanner; Change _Scanner's template argument from
_FwdIter to _CharT. Avoid use of std::map and std::set by using arrays
instead.
* include/bits/regex_scanner.tcc (_Scanner<>::_Scanner(),
_Scanner<>::_M_scan_normal(), _Scanner<>::_M_eat_escape_ecma(),
_Scanner<>::_M_eat_escape_posix(), _Scanner<>::_M_eat_escape_awk()):
Likewise.
* include/std/regex: Include <cstring> for std::strchr.
diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index b73fe30..e692726 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -59,7 +59,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ return make_shared<_RegexT>(std::move(_M_nfa)); }
private:
- typedef _Scanner<_FwdIter> _ScannerT;
+ typedef _Scanner<typename _TraitsT::char_type> _ScannerT;
typedef typename _ScannerT::_TokenT _TokenT;
typedef _StateSeq<_TraitsT> _StateSeqT;
typedef std::stack<_StateSeqT, std::vector<_StateSeqT>> _StackT;
diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h
index d113c5d..6dc2b4e 100644
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -39,6 +39,154 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
* @{
*/
+ struct _ScannerBase
+ {
+ public:
+ /// Token types returned from the scanner.
+ enum _TokenT
+ {
+ _S_token_anychar,
+ _S_token_ord_char,
+ _S_token_oct_num,
+ _S_token_hex_num,
+ _S_token_backref,
+ _S_token_subexpr_begin,
+ _S_token_subexpr_no_group_begin,
+ _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
+ _S_token_subexpr_end,
+ _S_token_bracket_begin,
+ _S_token_bracket_neg_begin,
+ _S_token_bracket_end,
+ _S_token_interval_begin,
+ _S_token_interval_end,
+ _S_token_quoted_class,
+ _S_token_char_class_name,
+ _S_token_collsymbol,
+ _S_token_equiv_class_name,
+ _S_token_opt,
+ _S_token_or,
+ _S_token_closure0,
+ _S_token_closure1,
+ _S_token_ungreedy,
+ _S_token_line_begin,
+ _S_token_line_end,
+ _S_token_word_bound, // neg if _M_value[0] == 'n'
+ _S_token_comma,
+ _S_token_dup_count,
+ _S_token_eof,
+ _S_token_unknown
+ };
+
+ protected:
+ typedef regex_constants::syntax_option_type _FlagT;
+
+ enum _StateT
+ {
+ _S_state_normal,
+ _S_state_in_brace,
+ _S_state_in_bracket,
+ };
+
+ protected:
+ _ScannerBase(_FlagT __flags)
+ : _M_state(_S_state_normal),
+ _M_flags(__flags),
+ _M_escape_tbl(_M_is_ecma()
+ ? _M_ecma_escape_tbl
+ : _M_awk_escape_tbl),
+ _M_spec_char(_M_is_ecma()
+ ? _M_ecma_spec_char
+ : _M_is_basic()
+ ? _M_basic_spec_char
+ : _M_extended_spec_char),
+ _M_at_bracket_start(false)
+ { }
+
+ protected:
+ const char*
+ _M_find_escape(char __c)
+ {
+ auto __it = _M_escape_tbl;
+ for (; __it->first != '\0'; ++__it)
+ if (__it->first == __c)
+ return &__it->second;
+ return nullptr;
+ }
+
+ bool
+ _M_is_ecma() const
+ { return _M_flags & regex_constants::ECMAScript; }
+
+ bool
+ _M_is_basic() const
+ { return _M_flags & (regex_constants::basic | regex_constants::grep); }
+
+ bool
+ _M_is_extended() const
+ {
+ return _M_flags & (regex_constants::extended
+ | regex_constants::egrep
+ | regex_constants::awk);
+ }
+
+ bool
+ _M_is_grep() const
+ { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
+
+ bool
+ _M_is_awk() const
+ { return _M_flags & regex_constants::awk; }
+
+ protected:
+ const std::pair<char, _TokenT> _M_token_tbl[9] =
+ {
+ {'^', _S_token_line_begin},
+ {'$', _S_token_line_end},
+ {'.', _S_token_anychar},
+ {'*', _S_token_closure0},
+ {'+', _S_token_closure1},
+ {'?', _S_token_opt},
+ {'|', _S_token_or},
+ {'\n', _S_token_or}, // grep and egrep
+ {'\0', _S_token_or},
+ };
+ const std::pair<char, char> _M_ecma_escape_tbl[8] =
+ {
+ {'0', '\0'},
+ {'b', '\b'},
+ {'f', '\f'},
+ {'n', '\n'},
+ {'r', '\r'},
+ {'t', '\t'},
+ {'v', '\v'},
+ {'\0', '\0'},
+ };
+ const std::pair<char, char> _M_awk_escape_tbl[11] =
+ {
+ {'"', '"'},
+ {'/', '/'},
+ {'\\', '\\'},
+ {'a', '\a'},
+ {'b', '\b'},
+ {'f', '\f'},
+ {'n', '\n'},
+ {'r', '\r'},
+ {'t', '\t'},
+ {'v', '\v'},
+ {'\0', '\0'},
+ };
+ const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
+ const char* _M_basic_spec_char = ".[\\*^$";
+ const char* _M_extended_spec_char = ".[\\()*+?{|^$";
+
+ _StateT _M_state;
+ _FlagT _M_flags;
+ _TokenT _M_token;
+ const std::pair<char, char>* _M_escape_tbl;
+ const char* _M_spec_char;
+ bool _M_at_bracket_start;
+ };
+
/**
* @brief struct _Scanner. Scans an input range for regex tokens.
*
@@ -49,51 +197,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
* constructor: different regular expression grammars will interpret
* the same input pattern in syntactically different ways.
*/
- template<typename _FwdIter>
+ template<typename _CharT>
class _Scanner
+ : public _ScannerBase
{
public:
- typedef typename std::iterator_traits<_FwdIter>::value_type _CharT;
+ typedef const _CharT* _IterT;
typedef std::basic_string<_CharT> _StringT;
typedef regex_constants::syntax_option_type _FlagT;
typedef const std::ctype<_CharT> _CtypeT;
- /// Token types returned from the scanner.
- enum _TokenT
- {
- _S_token_anychar,
- _S_token_ord_char,
- _S_token_oct_num,
- _S_token_hex_num,
- _S_token_backref,
- _S_token_subexpr_begin,
- _S_token_subexpr_no_group_begin,
- _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
- _S_token_subexpr_end,
- _S_token_bracket_begin,
- _S_token_bracket_neg_begin,
- _S_token_bracket_end,
- _S_token_interval_begin,
- _S_token_interval_end,
- _S_token_quoted_class,
- _S_token_char_class_name,
- _S_token_collsymbol,
- _S_token_equiv_class_name,
- _S_token_opt,
- _S_token_or,
- _S_token_closure0,
- _S_token_closure1,
- _S_token_ungreedy,
- _S_token_line_begin,
- _S_token_line_end,
- _S_token_word_bound, // neg if _M_value[0] == 'n'
- _S_token_comma,
- _S_token_dup_count,
- _S_token_eof,
- _S_token_unknown
- };
-
- _Scanner(_FwdIter __begin, _FwdIter __end,
+ _Scanner(_IterT __begin, _IterT __end,
_FlagT __flags, std::locale __loc);
void
@@ -113,13 +227,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
private:
- enum _StateT
- {
- _S_state_normal,
- _S_state_in_brace,
- _S_state_in_bracket,
- };
-
void
_M_scan_normal();
@@ -141,49 +248,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
void
_M_eat_class(char);
- constexpr bool
- _M_is_ecma()
- { return _M_flags & regex_constants::ECMAScript; }
-
- constexpr bool
- _M_is_basic()
- { return _M_flags & (regex_constants::basic | regex_constants::grep); }
-
- constexpr bool
- _M_is_extended()
- {
- return _M_flags & (regex_constants::extended
- | regex_constants::egrep
- | regex_constants::awk);
- }
-
- constexpr bool
- _M_is_grep()
- { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
-
- constexpr bool
- _M_is_awk()
- { return _M_flags & regex_constants::awk; }
-
- _StateT _M_state;
- _FwdIter _M_current;
- _FwdIter _M_end;
- _FlagT _M_flags;
+ _IterT _M_current;
+ _IterT _M_end;
_CtypeT& _M_ctype;
- _TokenT _M_token;
_StringT _M_value;
- bool _M_at_bracket_start;
- public:
- // FIXME: make them static when this file is stable.
- const std::map<char, _TokenT> _M_token_map;
- const std::map<char, char> _M_ecma_escape_map;
- const std::map<char, char> _M_awk_escape_map;
- const std::set<char> _M_ecma_spec_char;
- const std::set<char> _M_basic_spec_char;
- const std::set<char> _M_extended_spec_char;
-
- const std::map<char, char>& _M_escape_map;
- const std::set<char>& _M_spec_char;
void (_Scanner::* _M_eat_escape)();
};
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc
index 34d78ec..d954d07 100644
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -52,106 +52,22 @@ namespace __detail
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
- template<typename _FwdIter>
- _Scanner<_FwdIter>::
- _Scanner(_FwdIter __begin, _FwdIter __end,
+ template<typename _CharT>
+ _Scanner<_CharT>::
+ _Scanner(typename _Scanner::_IterT __begin,
+ typename _Scanner::_IterT __end,
_FlagT __flags, std::locale __loc)
- : _M_state(_S_state_normal), _M_current(__begin), _M_end(__end),
- _M_flags(__flags),
+ : _ScannerBase(__flags),
+ _M_current(__begin), _M_end(__end),
_M_ctype(std::use_facet<_CtypeT>(__loc)),
- _M_at_bracket_start(false),
- _M_token_map
- {
- {'^', _S_token_line_begin},
- {'$', _S_token_line_end},
- {'.', _S_token_anychar},
- {'*', _S_token_closure0},
- {'+', _S_token_closure1},
- {'?', _S_token_opt},
- {'|', _S_token_or},
- // grep and egrep
- {'\n', _S_token_or},
- },
- _M_ecma_escape_map
- {
- {'0', '\0'},
- {'b', '\b'},
- {'f', '\f'},
- {'n', '\n'},
- {'r', '\r'},
- {'t', '\t'},
- {'v', '\v'},
- },
- _M_awk_escape_map
- {
- {'"', '"'},
- {'/', '/'},
- {'\\', '\\'},
- {'a', '\a'},
- {'b', '\b'},
- {'f', '\f'},
- {'n', '\n'},
- {'r', '\r'},
- {'t', '\t'},
- {'v', '\v'},
- },
- _M_ecma_spec_char
- {
- '^',
- '$',
- '\\',
- '.',
- '*',
- '+',
- '?',
- '(',
- ')',
- '[',
- ']',
- '{',
- '}',
- '|',
- },
- _M_basic_spec_char
- {
- '.',
- '[',
- '\\',
- '*',
- '^',
- '$',
- },
- _M_extended_spec_char
- {
- '.',
- '[',
- '\\',
- '(',
- ')',
- '*',
- '+',
- '?',
- '{',
- '|',
- '^',
- '$',
- },
- _M_escape_map(_M_is_ecma()
- ? _M_ecma_escape_map
- : _M_awk_escape_map),
- _M_spec_char(_M_is_ecma()
- ? _M_ecma_spec_char
- : _M_is_basic()
- ? _M_basic_spec_char
- : _M_extended_spec_char),
_M_eat_escape(_M_is_ecma()
? &_Scanner::_M_eat_escape_ecma
: &_Scanner::_M_eat_escape_posix)
{ _M_advance(); }
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_advance()
{
if (_M_current == _M_end)
@@ -173,12 +89,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Differences between styles:
// 1) "\(", "\)", "\{" in basic. It's not escaping.
// 2) "(?:", "(?=", "(?!" in ECMAScript.
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_scan_normal()
{
auto __c = *_M_current++;
+ const char* __pos;
if (__c == '\\')
{
@@ -244,11 +161,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_state = _S_state_in_brace;
_M_token = _S_token_interval_begin;
}
- else if ((_M_spec_char.count(_M_ctype.narrow(__c, '\0'))
+ else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
+ != nullptr
+ && *__pos != '\0'
&& __c != ']'
&& __c != '}')
|| (_M_is_grep() && __c == '\n'))
- _M_token = _M_token_map.at(__c);
+ {
+ auto __it = _M_token_tbl;
+ auto __narrowc = _M_ctype.narrow(__c, '\0');
+ for (; __it->first != '\0'; ++__it)
+ if (__it->first == __narrowc)
+ {
+ _M_token = __it->second;
+ return;
+ }
+ _GLIBCXX_DEBUG_ASSERT(false);
+ }
else
{
_M_token = _S_token_ord_char;
@@ -259,9 +188,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Differences between styles:
// 1) different semantics of "[]" and "[^]".
// 2) Escaping in bracket expr.
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_scan_in_bracket()
{
if (_M_current == _M_end)
@@ -316,9 +245,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Differences between styles:
// 1) "\}" in basic style.
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_scan_in_brace()
{
if (_M_current == _M_end)
@@ -357,21 +286,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__throw_regex_error(regex_constants::error_badbrace);
}
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_eat_escape_ecma()
{
if (_M_current == _M_end)
__throw_regex_error(regex_constants::error_escape);
auto __c = *_M_current++;
+ auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
- if (_M_escape_map.count(_M_ctype.narrow(__c, '\0'))
- && (__c != 'b' || _M_state == _S_state_in_bracket))
+ if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
{
_M_token = _S_token_ord_char;
- _M_value.assign(1, _M_escape_map.at(__c));
+ _M_value.assign(1, *__pos);
}
else if (__c == 'b')
{
@@ -431,17 +360,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Differences between styles:
// 1) Extended doesn't support backref, but basic does.
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_eat_escape_posix()
{
if (_M_current == _M_end)
__throw_regex_error(regex_constants::error_escape);
auto __c = *_M_current;
+ auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
- if (_M_spec_char.count(_M_ctype.narrow(__c, '\0')))
+ if (__pos != nullptr && *__pos != '\0')
{
_M_token = _S_token_ord_char;
_M_value.assign(1, __c);
@@ -469,17 +399,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
++_M_current;
}
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_eat_escape_awk()
{
auto __c = *_M_current++;
+ auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
- if (_M_escape_map.count(_M_ctype.narrow(__c, '\0')))
+ if (__pos != nullptr)
{
_M_token = _S_token_ord_char;
- _M_value.assign(1, _M_escape_map.at(__c));
+ _M_value.assign(1, *__pos);
}
// \ddd for oct representation
else if (_M_ctype.is(_CtypeT::digit, __c)
@@ -505,9 +436,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Eats a character class or throws an exception.
// __ch could be ':', '.' or '=', _M_current is the char after ']' when
// returning.
- template<typename _FwdIter>
+ template<typename _CharT>
void
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_eat_class(char __ch)
{
for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
@@ -525,9 +456,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
#ifdef _GLIBCXX_DEBUG
- template<typename _FwdIter>
+ template<typename _CharT>
std::ostream&
- _Scanner<_FwdIter>::
+ _Scanner<_CharT>::
_M_print(std::ostream& ostr)
{
switch (_M_token)
diff --git a/libstdc++-v3/include/std/regex b/libstdc++-v3/include/std/regex
index 9395f50..f8a5d02 100644
--- a/libstdc++-v3/include/std/regex
+++ b/libstdc++-v3/include/std/regex
@@ -53,6 +53,7 @@
#include <string>
#include <utility>
#include <vector>
+#include <cstring>
#include <bits/regex_constants.h>
#include <bits/regex_error.h>
More information about the Libstdc++
mailing list