[Patch 3/3] Separate `repeating` node from `alternative` node in regex

Tim Shen timshen91@gmail.com
Fri Apr 25 22:14:00 GMT 2014


Patch attached.


-- 
Regards,
Tim Shen
-------------- next part --------------
commit cd0683e3a3eae0c62d2867fb5456edb3735b38ae
Author: tim <timshen91@gmail.com>
Date:   Fri Apr 25 10:45:26 2014 -0400

    2014-04-25  Tim Shen  <timshen91@gmail.com>
    
    	* include/bits/regex_automaton.h (_NFA<>::_M_insert_repeat):
    	Add _S_opcode_repeat support to distingush a loop from
    	_S_opcode_alternative.
    	* include/bits/regex_automaton.tcc (_State_base::_M_print,
    	_State_base::_M_dot, _NFA<>::_M_eliminate_dummy,
    	_StateSeq<>::_M_clone): Likewise.
    	* include/bits/regex_compiler.tcc (_Compiler<>::_M_quantifier):
    	Likewise.
    	* include/bits/regex_executor.tcc (_Executor<>::_M_dfs): Likewise.
    	* include/bits/regex_scanner.tcc (_Scanner<>::_M_eat_escape_ecma):
    	Uglify local variable __i.
    	* include/bits/regex_compiler.h (_BracketMatcher<>::_M_make_cache):
    	Use size_t instead of int to compare with vector::size().

diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h
index 64ecd6d..1b0da14 100644
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -52,6 +52,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   {
       _S_opcode_unknown,
       _S_opcode_alternative,
+      _S_opcode_repeat,
       _S_opcode_backref,
       _S_opcode_line_begin_assertion,
       _S_opcode_line_end_assertion,
@@ -74,7 +75,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       size_t _M_backref_index;  // for _S_opcode_backref
       struct
       {
-	// for _S_opcode_alternative or _S_opcode_subexpr_lookahead
+	// for _S_opcode_alternative, _S_opcode_repeat and
+	// _S_opcode_subexpr_lookahead
 	_StateIdT  _M_alt;
 	// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
 	// quantifiers (ungreedy if set true)
@@ -174,6 +176,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	// approach.
 	__tmp._M_next = __next;
 	__tmp._M_alt = __alt;
+	return _M_insert_state(std::move(__tmp));
+      }
+
+      _StateIdT
+      _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
+      {
+	_StateT __tmp(_S_opcode_repeat);
+	// It labels every quantifier to make greedy comparison easier in BFS
+	// approach.
+	__tmp._M_next = __next;
+	__tmp._M_alt = __alt;
 	__tmp._M_neg = __neg;
 	return _M_insert_state(std::move(__tmp));
       }
diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc
index 38787fa..e0ac3f9 100644
--- a/libstdc++-v3/include/bits/regex_automaton.tcc
+++ b/libstdc++-v3/include/bits/regex_automaton.tcc
@@ -41,6 +41,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     switch (_M_opcode)
     {
       case _S_opcode_alternative:
+      case _S_opcode_repeat:
 	ostr << "alt next=" << _M_next << " alt=" << _M_alt;
 	break;
       case _S_opcode_subexpr_begin:
@@ -72,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     switch (_M_opcode)
     {
       case _S_opcode_alternative:
+      case _S_opcode_repeat:
 	__ostr << __id << " [label=\"" << __id << "\\nALT\"];\n"
 	       << __id << " -> " << _M_next
 	       << " [label=\"next\", tailport=\"s\"];\n"
@@ -174,6 +176,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		 == _S_opcode_dummy)
 	    __it._M_next = (*this)[__it._M_next]._M_next;
 	  if (__it._M_opcode == _S_opcode_alternative
+	      || __it._M_opcode == _S_opcode_repeat
 	      || __it._M_opcode == _S_opcode_subexpr_lookahead)
 	    while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode
 		   == _S_opcode_dummy)
@@ -198,6 +201,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  auto __id = _M_nfa._M_insert_state(__dup);
 	  __m[__u] = __id;
 	  if (__dup._M_opcode == _S_opcode_alternative
+	      || __dup._M_opcode == _S_opcode_repeat
 	      || __dup._M_opcode == _S_opcode_subexpr_lookahead)
 	    if (__dup._M_alt != _S_invalid_state_id && __m[__dup._M_alt] == -1)
 	      __stack.push(__dup._M_alt);
@@ -217,6 +221,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	      __ref._M_next = __m[__ref._M_next];
 	    }
 	  if (__ref._M_opcode == _S_opcode_alternative
+	      || __ref._M_opcode == _S_opcode_repeat
 	      || __ref._M_opcode == _S_opcode_subexpr_lookahead)
 	    if (__ref._M_alt != _S_invalid_state_id)
 	      {
diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h
index f5a198f..d7e2162 100644
--- a/libstdc++-v3/include/bits/regex_compiler.h
+++ b/libstdc++-v3/include/bits/regex_compiler.h
@@ -421,7 +421,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       void
       _M_make_cache(true_type)
       {
-	for (int __i = 0; __i < _M_cache.size(); __i++)
+	for (size_t __i = 0; __i < _M_cache.size(); __i++)
 	  _M_cache[static_cast<_UnsignedCharT>(__i)] =
 	    _M_apply(__i, false_type());
       }
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index 128dac1..3cf9e457 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -188,8 +188,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	{
 	  __init();
 	  auto __e = _M_pop();
-	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
-						      __e._M_start, __neg));
+	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id,
+							 __e._M_start, __neg));
 	  __e._M_append(__r);
 	  _M_stack.push(__r);
 	}
@@ -197,8 +197,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	{
 	  __init();
 	  auto __e = _M_pop();
-	  __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
-					     __neg));
+	  __e._M_append(_M_nfa._M_insert_repeat(_S_invalid_state_id,
+						__e._M_start, __neg));
 	  _M_stack.push(__e);
 	}
       else if (_M_match_token(_ScannerT::_S_token_opt))
@@ -206,8 +206,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  __init();
 	  auto __e = _M_pop();
 	  auto __end = _M_nfa._M_insert_dummy();
-	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
-						      __e._M_start, __neg));
+	  _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id,
+							 __e._M_start, __neg));
 	  __e._M_append(__end);
 	  __r._M_append(__end);
 	  _M_stack.push(__r);
@@ -244,8 +244,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    {
 	      auto __tmp = __r._M_clone();
 	      _StateSeqT __s(_M_nfa,
-			     _M_nfa._M_insert_alt(_S_invalid_state_id,
-						  __tmp._M_start, __neg));
+			     _M_nfa._M_insert_repeat(_S_invalid_state_id,
+						     __tmp._M_start, __neg));
 	      __tmp._M_append(__s);
 	      __e._M_append(__s);
 	    }
@@ -261,8 +261,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	      for (long __i = 0; __i < __n; ++__i)
 		{
 		  auto __tmp = __r._M_clone();
-		  auto __alt = _M_nfa._M_insert_alt(__tmp._M_start,
-						    __end, __neg);
+		  auto __alt = _M_nfa._M_insert_repeat(__tmp._M_start,
+						       __end, __neg);
 		  __stack.push(__alt);
 		  __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end));
 		}
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index bb10a72..6dd4d25 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -194,7 +194,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	}
     };
 
-  // TODO: Use a function vector to dispatch, instead of using switch-case.
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
     bool __dfs_mode>
   template<bool __match_mode>
@@ -217,7 +216,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	// of this quantifier". Executing _M_next first or _M_alt first don't
 	// mean the same thing, and we need to choose the correct order under
 	// given greedy mode.
-	case _S_opcode_alternative:
+	case _S_opcode_repeat:
 	  {
 	    // Greedy.
 	    if (!__state._M_neg)
@@ -364,6 +363,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		  }
 	    }
 	  break;
+	case _S_opcode_alternative:
+	  _M_dfs<__match_mode>(__state._M_alt);
+	  if (!__dfs_mode || !_M_has_sol)
+	    _M_dfs<__match_mode>(__state._M_next);
+	  break;
 	default:
 	  _GLIBCXX_DEBUG_ASSERT(false);
 	}
diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc
index 5332d2e..f501ff6 100644
--- a/libstdc++-v3/include/bits/regex_scanner.tcc
+++ b/libstdc++-v3/include/bits/regex_scanner.tcc
@@ -335,7 +335,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       else if (__c == 'x' || __c == 'u')
 	{
 	  _M_value.erase();
-	  for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
+	  for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
 	    {
 	      if (_M_current == _M_end
 		  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))


More information about the Gcc-patches mailing list