[Patch 2/3] Make regex's infinite loop detection accurate

Tim Shen timshen91@gmail.com
Fri Apr 25 22:04:00 GMT 2014


On Fri, Apr 25, 2014 at 4:40 PM, Tim Shen <timshen91@gmail.com> wrote:
> regex_match("", regex("(?:)*") will cause segfault because the
> infinite loop detection (writeen by me) is stupid. :O

Patch attached.


-- 
Regards,
Tim Shen
-------------- next part --------------
commit 71b52302793f501a60783ea6dec09202120ce592
Author: tim <timshen91@gmail.com>
Date:   Fri Apr 25 01:50:11 2014 -0400

    2014-04-25  Tim Shen  <timshen91@gmail.com>
    
    	* include/bits/regex_executor.h: Add _M_rep_count to track how
    	many times this repeat node are visited.
    	* include/bits/regex_executor.tcc (_Executor<>::_M_rep_once_more,
    	_Executor<>::_M_dfs): Use _M_rep_count to prevent entering
    	infinite loop.

diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 708c78e..d185c27 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -73,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _M_results(__results),
       _M_match_queue(__dfs_mode ? nullptr
 		     : new vector<pair<_StateIdT, _ResultsVec>>()),
+      _M_rep_count(_M_nfa.size()),
       _M_visited(__dfs_mode ? nullptr : new vector<bool>(_M_nfa.size())),
       _M_flags((__flags & regex_constants::match_prev_avail)
 	       ? (__flags
@@ -104,6 +105,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     private:
       template<bool __match_mode>
 	void
+	_M_rep_once_more(_StateIdT);
+
+      template<bool __match_mode>
+	void
 	_M_dfs(_StateIdT __start);
 
       template<bool __match_mode>
@@ -151,6 +156,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       // character.
       std::unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue;
       // Used in BFS, indicating that which state is already visited.
+      std::vector<pair<_BiIter, int>>                       _M_rep_count;
       std::unique_ptr<vector<bool>>                         _M_visited;
       _FlagT                                                _M_flags;
       // To record current solution.
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 68a5e04..bb10a72 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -161,6 +161,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       return false;
     }
 
+  // __rep_count records how many times (__rep_count.second)
+  // this node are visited under certain input iterator
+  // (__rep_count.first). This prevent the executor from entering
+  // infinite loop by refusing to continue when it's already been
+  // visited more than twice. It's `twice` instead of `once` because
+  // we need to spare one more time for potential group capture.
+  template<typename _BiIter, typename _Alloc, typename _TraitsT,
+    bool __dfs_mode>
+  template<bool __match_mode>
+    void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
+    _M_rep_once_more(_StateIdT __i)
+    {
+      const auto& __state = _M_nfa[__i];
+      auto& __rep_count = _M_rep_count[__i];
+      if (__rep_count.second == 0 || __rep_count.first != _M_current)
+	{
+	  auto back = __rep_count;
+	  __rep_count.first = _M_current;
+	  __rep_count.second = 1;
+	  _M_dfs<__match_mode>(__state._M_alt);
+	  __rep_count = back;
+	}
+      else
+	{
+	  if (__rep_count.second < 2)
+	    {
+	      __rep_count.second++;
+	      _M_dfs<__match_mode>(__state._M_alt);
+	      __rep_count.second--;
+	    }
+	}
+    };
+
   // TODO: Use a function vector to dispatch, instead of using switch-case.
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
     bool __dfs_mode>
@@ -185,68 +218,60 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	// mean the same thing, and we need to choose the correct order under
 	// given greedy mode.
 	case _S_opcode_alternative:
-	  // Greedy.
-	  if (!__state._M_neg)
-	    {
-	      // "Once more" is preferred in greedy mode.
-	      _M_dfs<__match_mode>(__state._M_alt);
-	      // If it's DFS executor and already accepted, we're done.
-	      if (!__dfs_mode || !_M_has_sol)
-		_M_dfs<__match_mode>(__state._M_next);
-	    }
-	  else // Non-greedy mode
-	    {
-	      if (__dfs_mode)
-		{
-		  // vice-versa.
+	  {
+	    // Greedy.
+	    if (!__state._M_neg)
+	      {
+		_M_rep_once_more<__match_mode>(__i);
+		// If it's DFS executor and already accepted, we're done.
+		if (!__dfs_mode || !_M_has_sol)
 		  _M_dfs<__match_mode>(__state._M_next);
-		  if (!_M_has_sol)
-		    _M_dfs<__match_mode>(__state._M_alt);
-		}
-	      else
-		{
-		  // DON'T attempt anything, because there's already another
-		  // state with higher priority accepted. This state cannot be
-		  // better by attempting its next node.
-		  if (!_M_has_sol)
-		    {
-		      _M_dfs<__match_mode>(__state._M_next);
-		      // DON'T attempt anything if it's already accepted. An
-		      // accepted state *must* be better than a solution that
-		      // matches a non-greedy quantifier one more time.
-		      if (!_M_has_sol)
-			_M_dfs<__match_mode>(__state._M_alt);
-		    }
-		}
+	      }
+	    else // Non-greedy mode
+	      {
+		if (__dfs_mode)
+		  {
+		    // vice-versa.
+		    _M_dfs<__match_mode>(__state._M_next);
+		    if (!_M_has_sol)
+		      _M_rep_once_more<__match_mode>(__i);
+		  }
+		else
+		  {
+		    // DON'T attempt anything, because there's already another
+		    // state with higher priority accepted. This state cannot be
+		    // better by attempting its next node.
+		    if (!_M_has_sol)
+		      {
+			_M_dfs<__match_mode>(__state._M_next);
+			// DON'T attempt anything if it's already accepted. An
+			// accepted state *must* be better than a solution that
+			// matches a non-greedy quantifier one more time.
+			if (!_M_has_sol)
+			  _M_rep_once_more<__match_mode>(__i);
+		      }
+		  }
+	      }
 	    }
 	  break;
 	case _S_opcode_subexpr_begin:
-	  // If there's nothing changed since last visit, do NOT continue.
-	  // This prevents the executor from get into infinite loop when using
-	  // "()*" to match "".
-	  if (!_M_cur_results[__state._M_subexpr].matched
-	      || _M_cur_results[__state._M_subexpr].first != _M_current)
-	    {
-	      auto& __res = _M_cur_results[__state._M_subexpr];
-	      auto __back = __res.first;
-	      __res.first = _M_current;
-	      _M_dfs<__match_mode>(__state._M_next);
-	      __res.first = __back;
-	    }
+	  {
+	    auto& __res = _M_cur_results[__state._M_subexpr];
+	    auto __back = __res.first;
+	    __res.first = _M_current;
+	    _M_dfs<__match_mode>(__state._M_next);
+	    __res.first = __back;
+	  }
 	  break;
 	case _S_opcode_subexpr_end:
-	  if (_M_cur_results[__state._M_subexpr].second != _M_current
-	      || _M_cur_results[__state._M_subexpr].matched != true)
-	    {
-	      auto& __res = _M_cur_results[__state._M_subexpr];
-	      auto __back = __res;
-	      __res.second = _M_current;
-	      __res.matched = true;
-	      _M_dfs<__match_mode>(__state._M_next);
-	      __res = __back;
-	    }
-	  else
+	  {
+	    auto& __res = _M_cur_results[__state._M_subexpr];
+	    auto __back = __res;
+	    __res.second = _M_current;
+	    __res.matched = true;
 	    _M_dfs<__match_mode>(__state._M_next);
+	    __res = __back;
+	  }
 	  break;
 	case _S_opcode_line_begin_assertion:
 	  if (_M_at_begin())
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
index 3fdbdad..c33d1b6 100644
--- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
@@ -50,6 +50,7 @@ test01()
     const char s[] = "";
     VERIFY( regex_match_debug(s, m, re) );
   }
+  VERIFY(regex_match_debug("", regex("(?:)*")));
 }
 
 int


More information about the Gcc-patches mailing list