From a670a9bb5477d63b47091347da53551d639bb8ee Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Sun, 27 Apr 2014 23:48:47 +0000 Subject: [PATCH] regex_automaton.h (_NFA<>::_M_insert_repeat): Add _S_opcode_repeat support to distingush a loop from _S_opcode_alternative. 2014-04-27 Tim Shen * include/bits/regex_automaton.h (_NFA<>::_M_insert_repeat): Add _S_opcode_repeat support to distingush a loop from _S_opcode_alternative. * include/bits/regex_automaton.tcc (_State_base::_M_print, _State_base::_M_dot, _NFA<>::_M_eliminate_dummy, _StateSeq<>::_M_clone): Likewise. * include/bits/regex_compiler.tcc (_Compiler<>::_M_quantifier): Likewise. * include/bits/regex_executor.tcc (_Executor<>::_M_dfs): Likewise. * include/bits/regex_scanner.tcc (_Scanner<>::_M_eat_escape_ecma): Uglify local variable __i. * include/bits/regex_compiler.h (_BracketMatcher<>::_M_make_cache): Use size_t instead of int to compare with vector::size(). 2014-04-27 Tim Shen * include/bits/regex_executor.h: Add _M_rep_count to track how many times this repeat node are visited. * include/bits/regex_executor.tcc (_Executor<>::_M_rep_once_more, _Executor<>::_M_dfs): Use _M_rep_count to prevent entering infinite loop. 2014-04-27 Tim Shen * include/bits/regex.tcc (__regex_algo_impl<>): Remove _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use _GLIBCXX_REGEX_USE_THOMPSON_NFA instead. * include/bits/regex_automaton.h: Remove quantifier counting variable. * include/bits/regex_automaton.tcc (_State_base::_M_dot): Adjust debug NFA dump. From-SVN: r209844 --- libstdc++-v3/ChangeLog | 33 ++++ libstdc++-v3/include/bits/regex.tcc | 35 ++--- libstdc++-v3/include/bits/regex_automaton.h | 21 ++- libstdc++-v3/include/bits/regex_automaton.tcc | 9 +- libstdc++-v3/include/bits/regex_compiler.h | 2 +- libstdc++-v3/include/bits/regex_compiler.tcc | 20 +-- libstdc++-v3/include/bits/regex_executor.h | 10 +- libstdc++-v3/include/bits/regex_executor.tcc | 143 +++++++++++------- libstdc++-v3/include/bits/regex_scanner.tcc | 2 +- .../regex_match/ecma/char/emptygroup.cc | 1 + 10 files changed, 175 insertions(+), 101 deletions(-) diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 2a2c06861956..6cc371c5d9d1 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,36 @@ +2014-04-27 Tim Shen + + * include/bits/regex_automaton.h (_NFA<>::_M_insert_repeat): + Add _S_opcode_repeat support to distingush a loop from + _S_opcode_alternative. + * include/bits/regex_automaton.tcc (_State_base::_M_print, + _State_base::_M_dot, _NFA<>::_M_eliminate_dummy, + _StateSeq<>::_M_clone): Likewise. + * include/bits/regex_compiler.tcc (_Compiler<>::_M_quantifier): + Likewise. + * include/bits/regex_executor.tcc (_Executor<>::_M_dfs): Likewise. + * include/bits/regex_scanner.tcc (_Scanner<>::_M_eat_escape_ecma): + Uglify local variable __i. + * include/bits/regex_compiler.h (_BracketMatcher<>::_M_make_cache): + Use size_t instead of int to compare with vector::size(). + +2014-04-27 Tim Shen + + * include/bits/regex_executor.h: Add _M_rep_count to track how + many times this repeat node are visited. + * include/bits/regex_executor.tcc (_Executor<>::_M_rep_once_more, + _Executor<>::_M_dfs): Use _M_rep_count to prevent entering + infinite loop. + +2014-04-27 Tim Shen + + * include/bits/regex.tcc (__regex_algo_impl<>): Remove + _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT and use + _GLIBCXX_REGEX_USE_THOMPSON_NFA instead. + * include/bits/regex_automaton.h: Remove quantifier counting variable. + * include/bits/regex_automaton.tcc (_State_base::_M_dot): + Adjust debug NFA dump. + 2014-04-25 Lars Gullik Bjønnes PR libstdc++/60710 diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc index 5fa1f018b3dc..0d737a0b74b3 100644 --- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -28,12 +28,12 @@ * Do not attempt to use it directly. @headername{regex} */ -// See below __regex_algo_impl to get what this is talking about. The default -// value 1 indicated a conservative optimization without giving up worst case -// performance. -#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT -#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 -#endif +// A non-standard switch to let the user pick the matching algorithm. +// If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA +// algorithm will be used. This algorithm is not enabled by default, +// and cannot be used if the regex contains back-references, but has better +// (polynomial instead of exponential) worst case performace. +// See __regex_algo_impl below. namespace std _GLIBCXX_VISIBILITY(default) { @@ -66,24 +66,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION for (auto& __it : __res) __it.matched = false; - // This function decide which executor to use under given circumstances. - // The _S_auto policy now is the following: if a NFA has no - // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT - // quantifiers (*, +, ?), the BFS executor will be used, other wise - // DFS executor. This is because DFS executor has a exponential upper - // bound, but better best-case performace. Meanwhile, BFS executor can - // effectively prevent from exponential-long time matching (which must - // contains many quantifiers), but it's slower in average. - // - // For simple regex, BFS executor could be 2 or more times slower than - // DFS executor. - // - // Of course, BFS executor cannot handle back-references. + // __policy is used by testsuites so that they can use Thompson NFA + // without defining a macro. Users should define + // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach. bool __ret; if (!__re._M_automaton->_M_has_backref - && (__policy == _RegexExecutorPolicy::_S_alternate - || __re._M_automaton->_M_quant_count - > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) +#ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA + && __policy == _RegexExecutorPolicy::_S_alternate +#endif + ) { _Executor<_BiIter, _Alloc, _TraitsT, false> __executor(__s, __e, __m, __re, __flags); diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index a442cfe21b7f..1b0da1453e95 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -52,6 +52,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { _S_opcode_unknown, _S_opcode_alternative, + _S_opcode_repeat, _S_opcode_backref, _S_opcode_line_begin_assertion, _S_opcode_line_end_assertion, @@ -74,9 +75,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION size_t _M_backref_index; // for _S_opcode_backref struct { - // for _S_opcode_alternative. - _StateIdT _M_quant_index; - // for _S_opcode_alternative or _S_opcode_subexpr_lookahead + // for _S_opcode_alternative, _S_opcode_repeat and + // _S_opcode_subexpr_lookahead _StateIdT _M_alt; // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or // quantifiers (ungreedy if set true) @@ -120,7 +120,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION explicit _NFA_base(_FlagT __f) : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0), - _M_quant_count(0), _M_has_backref(false) + _M_has_backref(false) { } _NFA_base(_NFA_base&&) = default; @@ -145,7 +145,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _FlagT _M_flags; _StateIdT _M_start_state; _SizeT _M_subexpr_count; - _SizeT _M_quant_count; bool _M_has_backref; }; @@ -175,7 +174,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateT __tmp(_S_opcode_alternative); // It labels every quantifier to make greedy comparison easier in BFS // approach. - __tmp._M_quant_index = this->_M_quant_count++; + __tmp._M_next = __next; + __tmp._M_alt = __alt; + return _M_insert_state(std::move(__tmp)); + } + + _StateIdT + _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg) + { + _StateT __tmp(_S_opcode_repeat); + // It labels every quantifier to make greedy comparison easier in BFS + // approach. __tmp._M_next = __next; __tmp._M_alt = __alt; __tmp._M_neg = __neg; diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index 1476ae20a5b9..e0ac3f94ceb7 100644 --- a/libstdc++-v3/include/bits/regex_automaton.tcc +++ b/libstdc++-v3/include/bits/regex_automaton.tcc @@ -41,6 +41,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION switch (_M_opcode) { case _S_opcode_alternative: + case _S_opcode_repeat: ostr << "alt next=" << _M_next << " alt=" << _M_alt; break; case _S_opcode_subexpr_begin: @@ -72,11 +73,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION switch (_M_opcode) { case _S_opcode_alternative: + case _S_opcode_repeat: __ostr << __id << " [label=\"" << __id << "\\nALT\"];\n" << __id << " -> " << _M_next - << " [label=\"epsilon\", tailport=\"s\"];\n" + << " [label=\"next\", tailport=\"s\"];\n" << __id << " -> " << _M_alt - << " [label=\"epsilon\", tailport=\"n\"];\n"; + << " [label=\"alt\", tailport=\"n\"];\n"; break; case _S_opcode_backref: __ostr << __id << " [label=\"" << __id << "\\nBACKREF " @@ -174,6 +176,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION == _S_opcode_dummy) __it._M_next = (*this)[__it._M_next]._M_next; if (__it._M_opcode == _S_opcode_alternative + || __it._M_opcode == _S_opcode_repeat || __it._M_opcode == _S_opcode_subexpr_lookahead) while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode == _S_opcode_dummy) @@ -198,6 +201,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __id = _M_nfa._M_insert_state(__dup); __m[__u] = __id; if (__dup._M_opcode == _S_opcode_alternative + || __dup._M_opcode == _S_opcode_repeat || __dup._M_opcode == _S_opcode_subexpr_lookahead) if (__dup._M_alt != _S_invalid_state_id && __m[__dup._M_alt] == -1) __stack.push(__dup._M_alt); @@ -217,6 +221,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __ref._M_next = __m[__ref._M_next]; } if (__ref._M_opcode == _S_opcode_alternative + || __ref._M_opcode == _S_opcode_repeat || __ref._M_opcode == _S_opcode_subexpr_lookahead) if (__ref._M_alt != _S_invalid_state_id) { diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index f5a198f65e92..d7e21624e370 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -421,7 +421,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_make_cache(true_type) { - for (int __i = 0; __i < _M_cache.size(); __i++) + for (size_t __i = 0; __i < _M_cache.size(); __i++) _M_cache[static_cast<_UnsignedCharT>(__i)] = _M_apply(__i, false_type()); } diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 128dac12bd72..3cf9e457ccd7 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -188,8 +188,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { __init(); auto __e = _M_pop(); - _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, - __e._M_start, __neg)); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id, + __e._M_start, __neg)); __e._M_append(__r); _M_stack.push(__r); } @@ -197,8 +197,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { __init(); auto __e = _M_pop(); - __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start, - __neg)); + __e._M_append(_M_nfa._M_insert_repeat(_S_invalid_state_id, + __e._M_start, __neg)); _M_stack.push(__e); } else if (_M_match_token(_ScannerT::_S_token_opt)) @@ -206,8 +206,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __init(); auto __e = _M_pop(); auto __end = _M_nfa._M_insert_dummy(); - _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, - __e._M_start, __neg)); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_repeat(_S_invalid_state_id, + __e._M_start, __neg)); __e._M_append(__end); __r._M_append(__end); _M_stack.push(__r); @@ -244,8 +244,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { auto __tmp = __r._M_clone(); _StateSeqT __s(_M_nfa, - _M_nfa._M_insert_alt(_S_invalid_state_id, - __tmp._M_start, __neg)); + _M_nfa._M_insert_repeat(_S_invalid_state_id, + __tmp._M_start, __neg)); __tmp._M_append(__s); __e._M_append(__s); } @@ -261,8 +261,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION for (long __i = 0; __i < __n; ++__i) { auto __tmp = __r._M_clone(); - auto __alt = _M_nfa._M_insert_alt(__tmp._M_start, - __end, __neg); + auto __alt = _M_nfa._M_insert_repeat(__tmp._M_start, + __end, __neg); __stack.push(__alt); __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end)); } diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 708c78e2081c..c110b88a3f49 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -73,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_results(__results), _M_match_queue(__dfs_mode ? nullptr : new vector>()), + _M_rep_count(_M_nfa.size()), _M_visited(__dfs_mode ? nullptr : new vector(_M_nfa.size())), _M_flags((__flags & regex_constants::match_prev_avail) ? (__flags @@ -102,6 +103,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_search(); private: + template + void + _M_rep_once_more(_StateIdT); + template void _M_dfs(_StateIdT __start); @@ -149,9 +154,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _ResultsVec& _M_results; // Used in BFS, saving states that need to be considered for the next // character. - std::unique_ptr>> _M_match_queue; + unique_ptr>> _M_match_queue; // Used in BFS, indicating that which state is already visited. - std::unique_ptr> _M_visited; + vector> _M_rep_count; + unique_ptr> _M_visited; _FlagT _M_flags; // To record current solution. _StateIdT _M_start_state; diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 68a5e0490f95..7f8993382c86 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -161,7 +161,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - // TODO: Use a function vector to dispatch, instead of using switch-case. + // __rep_count records how many times (__rep_count.second) + // this node is visited under certain input iterator + // (__rep_count.first). This prevent the executor from entering + // infinite loop by refusing to continue when it's already been + // visited more than twice. It's `twice` instead of `once` because + // we need to spare one more time for potential group capture. + template + template + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_rep_once_more(_StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + auto& __rep_count = _M_rep_count[__i]; + if (__rep_count.second == 0 || __rep_count.first != _M_current) + { + auto __back = __rep_count; + __rep_count.first = _M_current; + __rep_count.second = 1; + _M_dfs<__match_mode>(__state._M_alt); + __rep_count = __back; + } + else + { + if (__rep_count.second < 2) + { + __rep_count.second++; + _M_dfs<__match_mode>(__state._M_alt); + __rep_count.second--; + } + } + }; + template template @@ -184,69 +216,61 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // of this quantifier". Executing _M_next first or _M_alt first don't // mean the same thing, and we need to choose the correct order under // given greedy mode. - case _S_opcode_alternative: - // Greedy. - if (!__state._M_neg) - { - // "Once more" is preferred in greedy mode. - _M_dfs<__match_mode>(__state._M_alt); - // If it's DFS executor and already accepted, we're done. - if (!__dfs_mode || !_M_has_sol) - _M_dfs<__match_mode>(__state._M_next); - } - else // Non-greedy mode - { - if (__dfs_mode) - { - // vice-versa. + case _S_opcode_repeat: + { + // Greedy. + if (!__state._M_neg) + { + _M_rep_once_more<__match_mode>(__i); + // If it's DFS executor and already accepted, we're done. + if (!__dfs_mode || !_M_has_sol) _M_dfs<__match_mode>(__state._M_next); - if (!_M_has_sol) - _M_dfs<__match_mode>(__state._M_alt); - } - else - { - // DON'T attempt anything, because there's already another - // state with higher priority accepted. This state cannot be - // better by attempting its next node. - if (!_M_has_sol) - { - _M_dfs<__match_mode>(__state._M_next); - // DON'T attempt anything if it's already accepted. An - // accepted state *must* be better than a solution that - // matches a non-greedy quantifier one more time. - if (!_M_has_sol) - _M_dfs<__match_mode>(__state._M_alt); - } - } + } + else // Non-greedy mode + { + if (__dfs_mode) + { + // vice-versa. + _M_dfs<__match_mode>(__state._M_next); + if (!_M_has_sol) + _M_rep_once_more<__match_mode>(__i); + } + else + { + // DON'T attempt anything, because there's already another + // state with higher priority accepted. This state cannot be + // better by attempting its next node. + if (!_M_has_sol) + { + _M_dfs<__match_mode>(__state._M_next); + // DON'T attempt anything if it's already accepted. An + // accepted state *must* be better than a solution that + // matches a non-greedy quantifier one more time. + if (!_M_has_sol) + _M_rep_once_more<__match_mode>(__i); + } + } + } } break; case _S_opcode_subexpr_begin: - // If there's nothing changed since last visit, do NOT continue. - // This prevents the executor from get into infinite loop when using - // "()*" to match "". - if (!_M_cur_results[__state._M_subexpr].matched - || _M_cur_results[__state._M_subexpr].first != _M_current) - { - auto& __res = _M_cur_results[__state._M_subexpr]; - auto __back = __res.first; - __res.first = _M_current; - _M_dfs<__match_mode>(__state._M_next); - __res.first = __back; - } + { + auto& __res = _M_cur_results[__state._M_subexpr]; + auto __back = __res.first; + __res.first = _M_current; + _M_dfs<__match_mode>(__state._M_next); + __res.first = __back; + } break; case _S_opcode_subexpr_end: - if (_M_cur_results[__state._M_subexpr].second != _M_current - || _M_cur_results[__state._M_subexpr].matched != true) - { - auto& __res = _M_cur_results[__state._M_subexpr]; - auto __back = __res; - __res.second = _M_current; - __res.matched = true; - _M_dfs<__match_mode>(__state._M_next); - __res = __back; - } - else + { + auto& __res = _M_cur_results[__state._M_subexpr]; + auto __back = __res; + __res.second = _M_current; + __res.matched = true; _M_dfs<__match_mode>(__state._M_next); + __res = __back; + } break; case _S_opcode_line_begin_assertion: if (_M_at_begin()) @@ -339,6 +363,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } break; + case _S_opcode_alternative: + _M_dfs<__match_mode>(__state._M_alt); + if (!__dfs_mode || !_M_has_sol) + _M_dfs<__match_mode>(__state._M_next); + break; default: _GLIBCXX_DEBUG_ASSERT(false); } diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index 5332d2eff84f..f501ff6c2cba 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -335,7 +335,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION else if (__c == 'x' || __c == 'u') { _M_value.erase(); - for (int i = 0; i < (__c == 'x' ? 2 : 4); i++) + for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) { if (_M_current == _M_end || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc index 3fdbdadedbe5..c33d1b618940 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc @@ -50,6 +50,7 @@ test01() const char s[] = ""; VERIFY( regex_match_debug(s, m, re) ); } + VERIFY(regex_match_debug("", regex("(?:)*"))); } int -- 2.43.5