This is the mail archive of the
libstdc++@gcc.gnu.org
mailing list for the libstdc++ project.
[Patch] Regex comments
- From: Tim Shen <timshen91 at gmail dot com>
- To: "libstdc++" <libstdc++ at gcc dot gnu dot org>, gcc-patches <gcc-patches at gcc dot gnu dot org>
- Date: Sun, 27 Oct 2013 16:12:11 -0400
- Subject: [Patch] Regex comments
- Authentication-results: sourceware.org; auth=none
Add comments for last regex commit.
Thanks!
--
Regards,
Tim Shen
commit b4aa16a08992866ca6fd60f40e422f551d0d6b2a
Author: tim <timshen91@gmail.com>
Date: Sun Oct 27 15:50:44 2013 -0400
2013-
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index d3b9a04..2f677de 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -53,6 +53,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
+ // This function serves in different modes, DFS mode or BFS mode, indicated
+ // by template variable __dfs_mode. See _M_main for details.
+ //
+ // ------------------------------------------------------------
+ //
+ // DFS mode:
+ //
+ // It applys a Depth-First-Search (aka backtracking) on given NFA and input
+ // string.
+ // At the very beginning the executor stands in the start state, then it try
+ // every possible state transition in current state recursively. Some state
+ // transitions consume input string, say, a single-char-matcher or a
+ // back-reference matcher; some don't, like assertion or other anchor nodes.
+ // When the input is exhausted and/or the current state is an accepting state,
+ // the whole executor returns true.
+ //
+ // TODO: This approach is exponentially slow for certain input.
+ // Try to compile the NFA to a DFA.
+ //
+ // Time complexity: o(match_length), O(2^(_M_nfa.size()))
+ // Space complexity: \theta(match_results.size() + match_length)
+ //
+ // ------------------------------------------------------------
+ //
+ // BFS mode:
+ //
+ // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html)
+ // explained this algorithm clearly.
+ //
+ // It first computes epsilon clousure for every state that's still matching,
+ // using the same DFS algorithm, but doesn't reenter states (set true in
+ // _M_visited), nor follow _S_opcode_match.
+ //
+ // Then apply DFS to every _S_opcode_match one by one (in _M_match_queue).
+ //
+ // The order of which states needs to be recursively applied DFS matters,
+ // depend on which greedy mode we use. See _S_opcode_alternative branch in
+ // _M_dfs.
+ //
+ // It significantly reduces potential duplicate states, so have a better
+ // upper bound; but it deserves more overhead.
+ //
+ // Time complexity: o(match_length * match_results.size())
+ // O(match_length * _M_nfa.size() * match_results.size())
+ // Space complexity: o(_M_nfa.size() + match_results.size())
+ // O(_M_nfa.size() * match_results.size())
template<typename _BiIter, typename _Alloc, typename _TraitsT,
bool __dfs_mode>
template<bool __match_mode>
@@ -68,18 +114,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
else
{
- // Like the DFS approach, it try every possible state transition;
- // Unlike DFS, it uses a queue instead of a stack to store matching
- // states. It's a BFS approach.
- //
- // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html)
- // explained this algorithm clearly.
- //
- // Time complexity: o(match_length * match_results.size())
- // O(match_length * _M_nfa.size()
- // * match_results.size())
- // Space complexity: o(_M_nfa.size() + match_results.size())
- // O(_M_nfa.size() * match_results.size())
_M_match_queue->push(make_pair(_M_start_state, _M_results));
bool __ret = false;
while (1)
@@ -132,20 +166,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
- // A _DFSExecutor perform a DFS on given NFA and input string. At the very
- // beginning the executor stands in the start state, then it try every
- // possible state transition in current state recursively. Some state
- // transitions consume input string, say, a single-char-matcher or a
- // back-reference matcher; some not, like assertion or other anchor nodes.
- // When the input is exhausted and the current state is an accepting state,
- // the whole executor return true.
- //
- // TODO: This approach is exponentially slow for certain input.
- // Try to compile the NFA to a DFA.
- //
- // Time complexity: o(match_length), O(2^(_M_nfa.size()))
- // Space complexity: \theta(match_results.size() + match_length)
- //
template<typename _BiIter, typename _Alloc, typename _TraitsT,
bool __dfs_mode>
template<bool __match_mode>
@@ -164,25 +184,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
case _S_opcode_alternative:
// Greedy or not, this is a question ;)
+ //
+ // _M_alt branch is "match once more", while _M_next is "get me out
+ // of this quantifier".
+
+ // Greedy.
if (!__state._M_neg)
{
+ // Once more.
_M_dfs<__match_mode>(__state._M_alt);
+ // If it's DFS executor and already accepted, we're done.
if (!__dfs_mode || !_M_has_sol)
_M_dfs<__match_mode>(__state._M_next);
}
- else
+ else // Ungreedy mode
{
if (__dfs_mode)
{
+ // vice-versa.
_M_dfs<__match_mode>(__state._M_next);
if (!_M_has_sol)
_M_dfs<__match_mode>(__state._M_alt);
}
else
{
+ // DON't attempt anything, because there's already someone
+ // with higher priority accepted. This state cannot be better
+ // by attempting its next node.
if (!_M_has_sol)
{
_M_dfs<__match_mode>(__state._M_next);
+ // DON'T attempt anything if it's already accepted. An
+ // accepted state *must* be better than a solution that
+ // has one more time ungreedy quantifier loop.
if (!_M_has_sol)
_M_dfs<__match_mode>(__state._M_alt);
}
@@ -190,9 +224,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
break;
case _S_opcode_subexpr_begin:
- // Here's the critical part: if there's nothing changed since last
- // visit, do NOT continue. This prevents the executor from get into
- // infinite loop when use "()*" to match "".
+ // If there's nothing changed since last visit, do NOT continue.
+ // This prevents the executor from get into infinite loop when use
+ // "()*" to match "".
//
// Every change on _M_cur_results will be roll back after the
// recursion step finished.
@@ -232,8 +266,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (_M_word_boundry(__state) == !__state._M_neg)
_M_dfs<__match_mode>(__state._M_next);
break;
- // Here __state._M_alt offers a single start node for a sub-NFA.
- // We recursivly invoke our algorithm to match the sub-NFA.
+ // Here __state._M_alt offers a single start node for a sub-NFA.
+ // We recursively invoke our algorithm to match the sub-NFA.
case _S_opcode_subexpr_lookahead:
if (_M_lookahead(__state) == !__state._M_neg)
_M_dfs<__match_mode>(__state._M_next);