This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [Patch] Regex comments

From: Tim Shen <timshen91 at gmail dot com>
To: Paolo Carlini <paolo dot carlini at oracle dot com>
Cc: "libstdc++" <libstdc++ at gcc dot gnu dot org>, gcc-patches <gcc-patches at gcc dot gnu dot org>
Date: Sun, 27 Oct 2013 16:28:16 -0400
Subject: Re: [Patch] Regex comments
Authentication-results: sourceware.org; auth=none
References: <CAPrifDkTYZd3zNxpast3MqiUzp5u8L1=ymAJY6qkQXu6-ywV1A at mail dot gmail dot com> <526D75A5 dot 7010508 at oracle dot com>

On Sun, Oct 27, 2013 at 4:20 PM, Paolo Carlini <paolo.carlini@oracle.com> wrote:
> Should we move this comment too before the 'case', and aligned with it?

Actually this comment is for the if statement below, not the whole
case branch. I've made this clearer by moving "Every change..." part
to the switch statement, where it should stay.


-- 
Regards,
Tim Shen

commit 1f13761ba34b13a52b4f41d2ebdeba1114205bac
Author: tim <timshen91@gmail.com>
Date:   Sun Oct 27 15:50:44 2013 -0400

    2013-10-27  Tim Shen  <timshen91@gmail.com>
    
    	* regex_executor.tcc: Add comments.

diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index d3b9a04..c4ce362 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -53,6 +53,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       return false;
     }
 
+  // This function serves in different modes, DFS mode or BFS mode, indicated
+  // by template variable __dfs_mode. See _M_main for details.
+  //
+  // ------------------------------------------------------------
+  //
+  // DFS mode:
+  //
+  // It applys a Depth-First-Search (aka backtracking) on given NFA and input
+  // string.
+  // At the very beginning the executor stands in the start state, then it try
+  // every possible state transition in current state recursively. Some state
+  // transitions consume input string, say, a single-char-matcher or a
+  // back-reference matcher; some don't, like assertion or other anchor nodes.
+  // When the input is exhausted and/or the current state is an accepting state,
+  // the whole executor returns true.
+  //
+  // TODO: This approach is exponentially slow for certain input.
+  //       Try to compile the NFA to a DFA.
+  //
+  // Time complexity: o(match_length), O(2^(_M_nfa.size()))
+  // Space complexity: \theta(match_results.size() + match_length)
+  //
+  // ------------------------------------------------------------
+  //
+  // BFS mode:
+  //
+  // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html)
+  // explained this algorithm clearly.
+  //
+  // It first computes epsilon clousure for every state that's still matching,
+  // using the same DFS algorithm, but doesn't reenter states (set true in
+  // _M_visited), nor follow _S_opcode_match.
+  //
+  // Then apply DFS to every _S_opcode_match one by one (in _M_match_queue).
+  //
+  // The order of which states needs to be recursively applied DFS matters,
+  // depend on which greedy mode we use. See _S_opcode_alternative branch in
+  // _M_dfs.
+  //
+  // It significantly reduces potential duplicate states, so have a better
+  // upper bound; but it deserves more overhead.
+  //
+  // Time complexity: o(match_length * match_results.size())
+  //                  O(match_length * _M_nfa.size() * match_results.size())
+  // Space complexity: o(_M_nfa.size() + match_results.size())
+  //                   O(_M_nfa.size() * match_results.size())
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
     bool __dfs_mode>
   template<bool __match_mode>
@@ -68,18 +114,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	}
       else
 	{
-	  // Like the DFS approach, it try every possible state transition;
-	  // Unlike DFS, it uses a queue instead of a stack to store matching
-	  // states. It's a BFS approach.
-	  //
-	  // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html)
-	  // explained this algorithm clearly.
-	  //
-	  // Time complexity: o(match_length * match_results.size())
-	  //                  O(match_length * _M_nfa.size()
-	  //                    * match_results.size())
-	  // Space complexity: o(_M_nfa.size() + match_results.size())
-	  //                   O(_M_nfa.size() * match_results.size())
 	  _M_match_queue->push(make_pair(_M_start_state, _M_results));
 	  bool __ret = false;
 	  while (1)
@@ -132,20 +166,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       return false;
     }
 
-  // A _DFSExecutor perform a DFS on given NFA and input string. At the very
-  // beginning the executor stands in the start state, then it try every
-  // possible state transition in current state recursively. Some state
-  // transitions consume input string, say, a single-char-matcher or a
-  // back-reference matcher; some not, like assertion or other anchor nodes.
-  // When the input is exhausted and the current state is an accepting state,
-  // the whole executor return true.
-  //
-  // TODO: This approach is exponentially slow for certain input.
-  //       Try to compile the NFA to a DFA.
-  //
-  // Time complexity: o(match_length), O(2^(_M_nfa.size()))
-  // Space complexity: \theta(match_results.size() + match_length)
-  //
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
     bool __dfs_mode>
   template<bool __match_mode>
@@ -160,29 +180,45 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	}
 
       const auto& __state = _M_nfa[__i];
+      // Every change on _M_cur_results and _M_current will be rolled back after
+      // finishing the recursion step.
       switch (__state._M_opcode)
 	{
 	case _S_opcode_alternative:
 	  // Greedy or not, this is a question ;)
+	  //
+	  // _M_alt branch is "match once more", while _M_next is "get me out
+	  // of this quantifier".
+
+	  // Greedy.
 	  if (!__state._M_neg)
 	    {
+	      // Once more.
 	      _M_dfs<__match_mode>(__state._M_alt);
+	      // If it's DFS executor and already accepted, we're done.
 	      if (!__dfs_mode || !_M_has_sol)
 		_M_dfs<__match_mode>(__state._M_next);
 	    }
-	  else
+	  else // Ungreedy mode
 	    {
 	      if (__dfs_mode)
 		{
+		  // vice-versa.
 		  _M_dfs<__match_mode>(__state._M_next);
 		  if (!_M_has_sol)
 		    _M_dfs<__match_mode>(__state._M_alt);
 		}
 	      else
 		{
+		  // DON't attempt anything, because there's already someone
+		  // with higher priority accepted. This state cannot be better
+		  // by attempting its next node.
 		  if (!_M_has_sol)
 		    {
 		      _M_dfs<__match_mode>(__state._M_next);
+		      // DON'T attempt anything if it's already accepted. An
+		      // accepted state *must* be better than a solution that
+		      // has one more time ungreedy quantifier loop.
 		      if (!_M_has_sol)
 			_M_dfs<__match_mode>(__state._M_alt);
 		    }
@@ -190,12 +226,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    }
 	  break;
 	case _S_opcode_subexpr_begin:
-	  // Here's the critical part: if there's nothing changed since last
-	  // visit, do NOT continue. This prevents the executor from get into
-	  // infinite loop when use "()*" to match "".
-	  //
-	  // Every change on _M_cur_results will be roll back after the
-	  // recursion step finished.
+	  // If there's nothing changed since last visit, do NOT continue.
+	  // This prevents the executor from get into infinite loop when use
+	  // "()*" to match "".
 	  if (!_M_cur_results[__state._M_subexpr].matched
 	      || _M_cur_results[__state._M_subexpr].first != _M_current)
 	    {
@@ -232,8 +265,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  if (_M_word_boundry(__state) == !__state._M_neg)
 	    _M_dfs<__match_mode>(__state._M_next);
 	  break;
-	  // Here __state._M_alt offers a single start node for a sub-NFA.
-	  // We recursivly invoke our algorithm to match the sub-NFA.
+	// Here __state._M_alt offers a single start node for a sub-NFA.
+	// We recursively invoke our algorithm to match the sub-NFA.
 	case _S_opcode_subexpr_lookahead:
 	  if (_M_lookahead(__state) == !__state._M_neg)
 	    _M_dfs<__match_mode>(__state._M_next);

Follow-Ups:
- Re: [Patch] Regex comments
  - From: Jonathan Wakely

References:
- [Patch] Regex comments
  - From: Tim Shen
- Re: [Patch] Regex comments
  - From: Paolo Carlini

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]