[Patch] Patch set for regex instantiation

Tim Shen timshen91@gmail.com
Mon Jan 13 01:31:00 GMT 2014


On Sun, Jan 12, 2014 at 5:46 PM, Paolo Carlini <paolo.carlini@oracle.com> wrote:
> strchrnul is a non-standard GNU extension, in general we can't use it.

...so use strchr instead.


-- 
Regards,
Tim Shen
-------------- next part --------------
commit 23b583b761ea30d5f66eb2470530ba3dd005d958
Author: tim <timshen91@gmail.com>
Date:   Fri Jan 10 18:06:38 2014 -0500

    2014-01-11  Tim Shen  <timshen91@gmail.com>
    
    	* include/bits/regex_compiler.h: Template _ScannerT on the character
    	type instead of the iterator type.
    	* include/bits/regex_scanner.h (_Scanner<>::_Scanner()): Separate
    	_ScannerBase from _Scanner; Change _Scanner's template argument from
    	_FwdIter to _CharT. Avoid use of std::map and std::set by using arrays
    	instead.
    	* include/bits/regex_scanner.tcc (_Scanner<>::_Scanner(),
    	_Scanner<>::_M_scan_normal(), _Scanner<>::_M_eat_escape_ecma(),
    	_Scanner<>::_M_eat_escape_posix(), _Scanner<>::_M_eat_escape_awk()):
    	Likewise.
    	* include/std/regex: Include <cstring> for strchr.

diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index b73fe30..e692726 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -59,7 +59,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       { return make_shared<_RegexT>(std::move(_M_nfa)); }
 
     private:
-      typedef _Scanner<_FwdIter>                              _ScannerT;
+      typedef _Scanner<typename _TraitsT::char_type>          _ScannerT;
       typedef typename _ScannerT::_TokenT                     _TokenT;
       typedef _StateSeq<_TraitsT>                     	      _StateSeqT;
       typedef std::stack<_StateSeqT, std::vector<_StateSeqT>> _StackT;
diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h
index d113c5d..6dc2b4e 100644
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -39,6 +39,154 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    * @{
    */
 
+  struct _ScannerBase
+  {
+  public:
+    /// Token types returned from the scanner.
+    enum _TokenT
+    {
+      _S_token_anychar,
+      _S_token_ord_char,
+      _S_token_oct_num,
+      _S_token_hex_num,
+      _S_token_backref,
+      _S_token_subexpr_begin,
+      _S_token_subexpr_no_group_begin,
+      _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
+      _S_token_subexpr_end,
+      _S_token_bracket_begin,
+      _S_token_bracket_neg_begin,
+      _S_token_bracket_end,
+      _S_token_interval_begin,
+      _S_token_interval_end,
+      _S_token_quoted_class,
+      _S_token_char_class_name,
+      _S_token_collsymbol,
+      _S_token_equiv_class_name,
+      _S_token_opt,
+      _S_token_or,
+      _S_token_closure0,
+      _S_token_closure1,
+      _S_token_ungreedy,
+      _S_token_line_begin,
+      _S_token_line_end,
+      _S_token_word_bound, // neg if _M_value[0] == 'n'
+      _S_token_comma,
+      _S_token_dup_count,
+      _S_token_eof,
+      _S_token_unknown
+    };
+
+  protected:
+    typedef regex_constants::syntax_option_type _FlagT;
+
+    enum _StateT
+    {
+      _S_state_normal,
+      _S_state_in_brace,
+      _S_state_in_bracket,
+    };
+
+  protected:
+    _ScannerBase(_FlagT __flags)
+    : _M_state(_S_state_normal),
+    _M_flags(__flags),
+    _M_escape_tbl(_M_is_ecma()
+		  ? _M_ecma_escape_tbl
+		  : _M_awk_escape_tbl),
+    _M_spec_char(_M_is_ecma()
+		 ? _M_ecma_spec_char
+		 : _M_is_basic()
+		 ? _M_basic_spec_char
+		 : _M_extended_spec_char),
+    _M_at_bracket_start(false)
+    { }
+
+  protected:
+    const char*
+    _M_find_escape(char __c)
+    {
+      auto __it = _M_escape_tbl;
+      for (; __it->first != '\0'; ++__it)
+	if (__it->first == __c)
+	  return &__it->second;
+      return nullptr;
+    }
+
+    bool
+    _M_is_ecma() const
+    { return _M_flags & regex_constants::ECMAScript; }
+
+    bool
+    _M_is_basic() const
+    { return _M_flags & (regex_constants::basic | regex_constants::grep); }
+
+    bool
+    _M_is_extended() const
+    {
+      return _M_flags & (regex_constants::extended
+			 | regex_constants::egrep
+			 | regex_constants::awk);
+    }
+
+    bool
+    _M_is_grep() const
+    { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
+
+    bool
+    _M_is_awk() const
+    { return _M_flags & regex_constants::awk; }
+
+  protected:
+    const std::pair<char, _TokenT> _M_token_tbl[9] =
+      {
+	{'^', _S_token_line_begin},
+	{'$', _S_token_line_end},
+	{'.', _S_token_anychar},
+	{'*', _S_token_closure0},
+	{'+', _S_token_closure1},
+	{'?', _S_token_opt},
+	{'|', _S_token_or},
+	{'\n', _S_token_or}, // grep and egrep
+	{'\0', _S_token_or},
+      };
+    const std::pair<char, char> _M_ecma_escape_tbl[8] =
+      {
+	{'0', '\0'},
+	{'b', '\b'},
+	{'f', '\f'},
+	{'n', '\n'},
+	{'r', '\r'},
+	{'t', '\t'},
+	{'v', '\v'},
+	{'\0', '\0'},
+      };
+    const std::pair<char, char> _M_awk_escape_tbl[11] =
+      {
+	{'"', '"'},
+	{'/', '/'},
+	{'\\', '\\'},
+	{'a', '\a'},
+	{'b', '\b'},
+	{'f', '\f'},
+	{'n', '\n'},
+	{'r', '\r'},
+	{'t', '\t'},
+	{'v', '\v'},
+	{'\0', '\0'},
+      };
+    const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
+    const char* _M_basic_spec_char = ".[\\*^$";
+    const char* _M_extended_spec_char = ".[\\()*+?{|^$";
+
+    _StateT                       _M_state;
+    _FlagT                        _M_flags;
+    _TokenT                       _M_token;
+    const std::pair<char, char>*  _M_escape_tbl;
+    const char*                   _M_spec_char;
+    bool                          _M_at_bracket_start;
+  };
+
   /**
    * @brief struct _Scanner. Scans an input range for regex tokens.
    *
@@ -49,51 +197,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    * constructor: different regular expression grammars will interpret
    * the same input pattern in syntactically different ways.
    */
-  template<typename _FwdIter>
+  template<typename _CharT>
     class _Scanner
+    : public _ScannerBase
     {
     public:
-      typedef typename std::iterator_traits<_FwdIter>::value_type _CharT;
+      typedef const _CharT*                                       _IterT;
       typedef std::basic_string<_CharT>                           _StringT;
       typedef regex_constants::syntax_option_type                 _FlagT;
       typedef const std::ctype<_CharT>                            _CtypeT;
 
-      /// Token types returned from the scanner.
-      enum _TokenT
-      {
-	_S_token_anychar,
-	_S_token_ord_char,
-	_S_token_oct_num,
-	_S_token_hex_num,
-	_S_token_backref,
-	_S_token_subexpr_begin,
-	_S_token_subexpr_no_group_begin,
-	_S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
-	_S_token_subexpr_end,
-	_S_token_bracket_begin,
-	_S_token_bracket_neg_begin,
-	_S_token_bracket_end,
-	_S_token_interval_begin,
-	_S_token_interval_end,
-	_S_token_quoted_class,
-	_S_token_char_class_name,
-	_S_token_collsymbol,
-	_S_token_equiv_class_name,
-	_S_token_opt,
-	_S_token_or,
-	_S_token_closure0,
-	_S_token_closure1,
-	_S_token_ungreedy,
-	_S_token_line_begin,
-	_S_token_line_end,
-	_S_token_word_bound, // neg if _M_value[0] == 'n'
-	_S_token_comma,
-	_S_token_dup_count,
-	_S_token_eof,
-	_S_token_unknown
-      };
-
-      _Scanner(_FwdIter __begin, _FwdIter __end,
+      _Scanner(_IterT __begin, _IterT __end,
 	       _FlagT __flags, std::locale __loc);
 
       void
@@ -113,13 +227,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 
     private:
-      enum _StateT
-      {
-	_S_state_normal,
-	_S_state_in_brace,
-	_S_state_in_bracket,
-      };
-
       void
       _M_scan_normal();
 
@@ -141,49 +248,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       void
       _M_eat_class(char);
 
-      constexpr bool
-      _M_is_ecma()
-      { return _M_flags & regex_constants::ECMAScript; }
-
-      constexpr bool
-      _M_is_basic()
-      { return _M_flags & (regex_constants::basic | regex_constants::grep); }
-
-      constexpr bool
-      _M_is_extended()
-      {
-	return _M_flags & (regex_constants::extended
-			   | regex_constants::egrep
-			   | regex_constants::awk);
-      }
-
-      constexpr bool
-      _M_is_grep()
-      { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
-
-      constexpr bool
-      _M_is_awk()
-      { return _M_flags & regex_constants::awk; }
-
-      _StateT                       _M_state;
-      _FwdIter                      _M_current;
-      _FwdIter                      _M_end;
-      _FlagT                        _M_flags;
+      _IterT                        _M_current;
+      _IterT                        _M_end;
       _CtypeT&                      _M_ctype;
-      _TokenT                       _M_token;
       _StringT                      _M_value;
-      bool                          _M_at_bracket_start;
-    public:
-      // FIXME: make them static when this file is stable.
-      const std::map<char, _TokenT> _M_token_map;
-      const std::map<char, char>    _M_ecma_escape_map;
-      const std::map<char, char>    _M_awk_escape_map;
-      const std::set<char>          _M_ecma_spec_char;
-      const std::set<char>          _M_basic_spec_char;
-      const std::set<char>          _M_extended_spec_char;
-
-      const std::map<char, char>&   _M_escape_map;
-      const std::set<char>&         _M_spec_char;
       void (_Scanner::* _M_eat_escape)();
     };
 
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc
index 34d78ec..403b4c0 100644
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -52,106 +52,22 @@ namespace __detail
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  template<typename _FwdIter>
-    _Scanner<_FwdIter>::
-    _Scanner(_FwdIter __begin, _FwdIter __end,
+  template<typename _CharT>
+    _Scanner<_CharT>::
+    _Scanner(typename _Scanner::_IterT __begin,
+	     typename _Scanner::_IterT __end,
 	     _FlagT __flags, std::locale __loc)
-    : _M_state(_S_state_normal), _M_current(__begin), _M_end(__end),
-      _M_flags(__flags),
+    : _ScannerBase(__flags),
+      _M_current(__begin), _M_end(__end),
       _M_ctype(std::use_facet<_CtypeT>(__loc)),
-      _M_at_bracket_start(false),
-      _M_token_map
-	{
-	  {'^', _S_token_line_begin},
-	  {'$', _S_token_line_end},
-	  {'.', _S_token_anychar},
-	  {'*', _S_token_closure0},
-	  {'+', _S_token_closure1},
-	  {'?', _S_token_opt},
-	  {'|', _S_token_or},
-	  // grep and egrep
-	  {'\n', _S_token_or},
-	},
-      _M_ecma_escape_map
-	{
-	  {'0', '\0'},
-	  {'b', '\b'},
-	  {'f', '\f'},
-	  {'n', '\n'},
-	  {'r', '\r'},
-	  {'t', '\t'},
-	  {'v', '\v'},
-	},
-      _M_awk_escape_map
-	{
-	  {'"', '"'},
-	  {'/', '/'},
-	  {'\\', '\\'},
-	  {'a', '\a'},
-	  {'b', '\b'},
-	  {'f', '\f'},
-	  {'n', '\n'},
-	  {'r', '\r'},
-	  {'t', '\t'},
-	  {'v', '\v'},
-	},
-      _M_ecma_spec_char
-	{
-	  '^',
-	  '$',
-	  '\\',
-	  '.',
-	  '*',
-	  '+',
-	  '?',
-	  '(',
-	  ')',
-	  '[',
-	  ']',
-	  '{',
-	  '}',
-	  '|',
-	},
-      _M_basic_spec_char
-	{
-	  '.',
-	  '[',
-	  '\\',
-	  '*',
-	  '^',
-	  '$',
-	},
-      _M_extended_spec_char
-	{
-	  '.',
-	  '[',
-	  '\\',
-	  '(',
-	  ')',
-	  '*',
-	  '+',
-	  '?',
-	  '{',
-	  '|',
-	  '^',
-	  '$',
-	},
-      _M_escape_map(_M_is_ecma()
-		    ? _M_ecma_escape_map
-		    : _M_awk_escape_map),
-      _M_spec_char(_M_is_ecma()
-		   ? _M_ecma_spec_char
-		   : _M_is_basic()
-		   ? _M_basic_spec_char
-		   : _M_extended_spec_char),
       _M_eat_escape(_M_is_ecma()
 		    ? &_Scanner::_M_eat_escape_ecma
 		    : &_Scanner::_M_eat_escape_posix)
     { _M_advance(); }
 
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_advance()
     {
       if (_M_current == _M_end)
@@ -173,12 +89,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // Differences between styles:
   // 1) "\(", "\)", "\{" in basic. It's not escaping.
   // 2) "(?:", "(?=", "(?!" in ECMAScript.
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_scan_normal()
     {
       auto __c = *_M_current++;
+      const char* __pos;
 
       if (__c == '\\')
 	{
@@ -244,11 +161,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  _M_state = _S_state_in_brace;
 	  _M_token = _S_token_interval_begin;
 	}
-      else if ((_M_spec_char.count(_M_ctype.narrow(__c, '\0'))
+      else if (((__pos = strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
+		  != nullptr
+		&& *__pos != '\0'
 		&& __c != ']'
 		&& __c != '}')
 	       || (_M_is_grep() && __c == '\n'))
-	_M_token = _M_token_map.at(__c);
+	{
+	  auto __it = _M_token_tbl;
+	  auto __narrowc = _M_ctype.narrow(__c, '\0');
+	  for (; __it->first != '\0'; ++__it)
+	    if (__it->first == __narrowc)
+	      {
+		_M_token = __it->second;
+		return;
+	      }
+	  _GLIBCXX_DEBUG_ASSERT(false);
+	}
       else
 	{
 	  _M_token = _S_token_ord_char;
@@ -259,9 +188,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // Differences between styles:
   // 1) different semantics of "[]" and "[^]".
   // 2) Escaping in bracket expr.
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_scan_in_bracket()
     {
       if (_M_current == _M_end)
@@ -316,9 +245,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   // Differences between styles:
   // 1) "\}" in basic style.
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_scan_in_brace()
     {
       if (_M_current == _M_end)
@@ -357,21 +286,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	__throw_regex_error(regex_constants::error_badbrace);
     }
 
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_eat_escape_ecma()
     {
       if (_M_current == _M_end)
 	__throw_regex_error(regex_constants::error_escape);
 
       auto __c = *_M_current++;
+      auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
 
-      if (_M_escape_map.count(_M_ctype.narrow(__c, '\0'))
-	  && (__c != 'b' || _M_state == _S_state_in_bracket))
+      if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
 	{
 	  _M_token = _S_token_ord_char;
-	  _M_value.assign(1, _M_escape_map.at(__c));
+	  _M_value.assign(1, *__pos);
 	}
       else if (__c == 'b')
 	{
@@ -431,17 +360,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   // Differences between styles:
   // 1) Extended doesn't support backref, but basic does.
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_eat_escape_posix()
     {
       if (_M_current == _M_end)
 	__throw_regex_error(regex_constants::error_escape);
 
       auto __c = *_M_current;
+      auto __pos = strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
 
-      if (_M_spec_char.count(_M_ctype.narrow(__c, '\0')))
+      if (__pos != nullptr && *__pos != '\0')
 	{
 	  _M_token = _S_token_ord_char;
 	  _M_value.assign(1, __c);
@@ -469,17 +399,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       ++_M_current;
     }
 
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_eat_escape_awk()
     {
       auto __c = *_M_current++;
+      auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
 
-      if (_M_escape_map.count(_M_ctype.narrow(__c, '\0')))
+      if (__pos != nullptr)
 	{
 	  _M_token = _S_token_ord_char;
-	  _M_value.assign(1, _M_escape_map.at(__c));
+	  _M_value.assign(1, *__pos);
 	}
       // \ddd for oct representation
       else if (_M_ctype.is(_CtypeT::digit, __c)
@@ -505,9 +436,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // Eats a character class or throwns an exception.
   // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
   // returning.
-  template<typename _FwdIter>
+  template<typename _CharT>
     void
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_eat_class(char __ch)
     {
       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
@@ -525,9 +456,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     }
 
 #ifdef _GLIBCXX_DEBUG
-  template<typename _FwdIter>
+  template<typename _CharT>
     std::ostream&
-    _Scanner<_FwdIter>::
+    _Scanner<_CharT>::
     _M_print(std::ostream& ostr)
     {
       switch (_M_token)
diff --git a/libstdc++-v3/include/std/regex b/libstdc++-v3/include/std/regex
index 9395f50..f8a5d02 100644
--- a/libstdc++-v3/include/std/regex
+++ b/libstdc++-v3/include/std/regex
@@ -53,6 +53,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <cstring>
 
 #include <bits/regex_constants.h>
 #include <bits/regex_error.h>


More information about the Libstdc++ mailing list