This is the mail archive of the libstdc++@gcc.gnu.org mailing list for the libstdc++ project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PR 61424] std::regex matches right to left, not leftmost longest

From: Tim Shen <timshen91 at gmail dot com>
To: "libstdc++" <libstdc++ at gcc dot gnu dot org>, gcc-patches <gcc-patches at gcc dot gnu dot org>, Jonathan Wakely <jwakely at redhat dot com>
Date: Fri, 6 Jun 2014 21:57:15 -0700
Subject: [PR 61424] std::regex matches right to left, not leftmost longest
Authentication-results: sourceware.org; auth=none

Here's a patch that is a little bit "quick & dirty". In BFS mode it's
not trivial to support ECMAScript's
"match from left to right". A quick solution is only using DFS for
ECMAScript. See the patch.

It is possible to support ECMAScript's "left to right" in BFS mode
(re2 does this), but I didn't see an obvious way. We could take a look
at re2.

Bootstrap & tested.

Thank you!


-- 
Regards,
Tim Shen

commit 1fef2d9d80f7dcda227c1bbaf909128e068f6907
Author: tim <timshen91@gmail.com>
Date:   Fri Jun 6 00:00:25 2014 -0700

    	PR libstdc++/61424
    	* include/bits/regex.tcc (__regex_algo_impl<>): Use DFS for ECMAScript,
    	not just regex containing back-references.
    	* include/bits/regex_automaton.h (_NFA_base<>::_NFA_base):
    	Remove _M_has_backref tracking.
    	* include/bits/regex_automaton.tcc (_NFA<>::_M_insert_backref):
    	Likewise.
    	* include/bits/regex_compiler.tcc (_Compiler<>::_M_disjunction):
    	exchange _M_next and _M_alt for alternative operator,
    	making matching from left to right.
    	* include/bits/regex_executor.h (_State_info<>::_M_get_sol_pos):
    	Add position tracking fom DFS.
    	* include/bits/regex_executor.tcc (_Executor<>::_M_main_dispatch,
    	_Executor<>::_M_dfs): Likewise.
    	* include/bits/regex_scanner.h: Remove unused enum entry.
    	* testsuite/28_regex/algorithms/regex_search/61424.cc: New
    	testcase from PR.

diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc
index a81f517..6a1faaf 100644
--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -70,7 +70,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       // without defining a macro. Users should define
       // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
       bool __ret;
-      if (!__re._M_automaton->_M_has_backref
+      if (!(__re._M_flags & regex_constants::ECMAScript)
 #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
 	  && __policy == _RegexExecutorPolicy::_S_alternate
 #endif
diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h
index 1b0da14..a45ce1a 100644
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -119,8 +119,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
     explicit
     _NFA_base(_FlagT __f)
-    : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
-    _M_has_backref(false)
+    : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
     { }
 
     _NFA_base(_NFA_base&&) = default;
@@ -145,7 +144,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     _FlagT                    _M_flags;
     _StateIdT                 _M_start_state;
     _SizeT                    _M_subexpr_count;
-    bool                      _M_has_backref;
   };
 
   template<typename _TraitsT>
diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc
index 1c6cfdd..78c83ec 100644
--- a/libstdc++-v3/include/bits/regex_automaton.tcc
+++ b/libstdc++-v3/include/bits/regex_automaton.tcc
@@ -160,7 +160,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       for (auto __it : this->_M_paren_stack)
 	if (__index == __it)
 	  __throw_regex_error(regex_constants::error_backref);
-      this->_M_has_backref = true;
       _StateT __tmp(_S_opcode_backref);
       __tmp._M_backref_index = __index;
       return _M_insert_state(std::move(__tmp));
diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc
index 0df10cc..f15f7dd 100644
--- a/libstdc++-v3/include/bits/regex_compiler.tcc
+++ b/libstdc++-v3/include/bits/regex_compiler.tcc
@@ -103,9 +103,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  auto __end = _M_nfa._M_insert_dummy();
 	  __alt1._M_append(__end);
 	  __alt2._M_append(__end);
+	  // __alt2 is state._M_next, __alt1 is state._M_alt. The executor
+	  // executes _M_alt before _M_next, as well as executing left
+	  // alternative before right one.
 	  _M_stack.push(_StateSeqT(_M_nfa,
-				   _M_nfa._M_insert_alt(__alt1._M_start,
-							__alt2._M_start, false),
+				   _M_nfa._M_insert_alt(__alt2._M_start,
+							__alt1._M_start, false),
 				   __end));
 	}
     }
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 1991c00..e02fa65 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -173,6 +173,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  void _M_queue(_StateIdT __i, const _ResultsVec& __res)
 	  { _M_match_queue.emplace_back(__i, __res); }
 
+	  _BiIter* _M_get_sol_pos() { return nullptr; }
+
 	  // Saves states that need to be considered for the next character.
 	  vector<pair<_StateIdT, _ResultsVec>>	_M_match_queue;
 	  // Indicates which states are already visited.
@@ -191,12 +193,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	  // Dummy implementations for DFS mode.
 	  bool _M_visited(_StateIdT) const { return false; }
 	  void _M_queue(_StateIdT, const _ResultsVec&) { }
+	  _BiIter* _M_get_sol_pos() { return &_M_sol_pos; }
 
 	  // To record current solution.
 	  _StateIdT _M_start;
+	  _BiIter   _M_sol_pos;
 	};
 
-
     public:
       _ResultsVec                                           _M_cur_results;
       _BiIter                                               _M_current;
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index aefa8f4..38b8ff2 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -82,6 +82,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     _M_main_dispatch(_Match_mode __match_mode, __dfs)
     {
       _M_has_sol = false;
+      *_M_states._M_get_sol_pos() = _BiIter();
       _M_cur_results = _M_results;
       _M_dfs(__match_mode, _M_states._M_start);
       return _M_has_sol;
@@ -338,7 +339,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 		  && (_M_flags & regex_constants::match_not_null))
 		_M_has_sol = false;
 	      if (_M_has_sol)
-		_M_results = _M_cur_results;
+		{
+		  if (_M_nfa._M_flags & regex_constants::ECMAScript)
+		    _M_results = _M_cur_results;
+		  else // POSIX
+		    {
+		      _GLIBCXX_DEBUG_ASSERT(_M_states._M_get_sol_pos());
+		      // Here's POSIX's logic: match the longest one. However
+		      // we never know which one (lhs or rhs of "|") is longer
+		      // unless we try both of them and compare the results.
+		      // The member variable _M_sol_pos records the end
+		      // position of the last successful match. It's better
+		      // to be larger, because POSIX regex is always greedy.
+		      // TODO: This could be slow.
+		      if (*_M_states._M_get_sol_pos() == _BiIter()
+			  || std::distance(_M_begin,
+					   *_M_states._M_get_sol_pos())
+			     < std::distance(_M_begin, _M_current))
+			{
+			  *_M_states._M_get_sol_pos() = _M_current;
+			  _M_results = _M_cur_results;
+			}
+		    }
+		}
 	    }
 	  else
 	    {
@@ -354,9 +377,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	    }
 	  break;
 	case _S_opcode_alternative:
-	  _M_dfs(__match_mode, __state._M_alt);
-	  if (!__dfs_mode || !_M_has_sol)
-	    _M_dfs(__match_mode, __state._M_next);
+	  if (_M_nfa._M_flags & regex_constants::ECMAScript)
+	    {
+	      // TODO: Let DFS support ECMAScript's alternative operation.
+	      _GLIBCXX_DEBUG_ASSERT(!__dfs_mode);
+	      _M_dfs(__match_mode, __state._M_alt);
+	      // Pick lhs if it matches. Only try rhs if it doesn't.
+	      if (!_M_has_sol)
+		_M_dfs(__match_mode, __state._M_next);
+	    }
+	  else
+	    {
+	      // Try both and compare the result.
+	      // See "case _S_opcode_accept:" handling above.
+	      _M_dfs(__match_mode, __state._M_alt);
+	      auto __has_sol = _M_has_sol;
+	      _M_has_sol = false;
+	      _M_dfs(__match_mode, __state._M_next);
+	      _M_has_sol |= __has_sol;
+	    }
 	  break;
 	default:
 	  _GLIBCXX_DEBUG_ASSERT(false);
diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h
index 6627db9..5552226 100644
--- a/libstdc++-v3/include/bits/regex_scanner.h
+++ b/libstdc++-v3/include/bits/regex_scanner.h
@@ -67,7 +67,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _S_token_or,
       _S_token_closure0,
       _S_token_closure1,
-      _S_token_ungreedy,
       _S_token_line_begin,
       _S_token_line_end,
       _S_token_word_bound, // neg if _M_value[0] == 'n'
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc
new file mode 100644
index 0000000..bdccb4a
--- /dev/null
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc
@@ -0,0 +1,52 @@
+// { dg-options "-std=gnu++11" }
+
+// Copyright (C) 2014 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// PR libstdc++/61424
+
+#include <regex>
+#include <testsuite_hooks.h>
+#include <testsuite_regex.h>
+
+using namespace std;
+using namespace __gnu_test;
+
+int main()
+{
+  regex_constants::syntax_option_type grammar[] = {
+    regex_constants::ECMAScript, regex_constants::extended,
+    regex_constants::awk, regex_constants::egrep
+  };
+
+  string sol[] = {
+      "tour",
+      "tournament",
+      "tournament",
+      "tournament",
+  };
+  int i = 0;
+  for (auto g : grammar)
+  {
+    regex re("tour|tournament|tourn", g);
+    const char str[] = "tournament";
+    cmatch m;
+    VERIFY(regex_search_debug(str, m, re));
+    VERIFY(sol[i] == m[0]);
+    i++;
+  }
+}

Follow-Ups:
- Re: [PR 61424] std::regex matches right to left, not leftmost longest
  - From: Jonathan Wakely

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]