[Patch 2/3] Make regex's infinite loop detection accurate
Tim Shen
timshen91@gmail.com
Fri Apr 25 22:04:00 GMT 2014
On Fri, Apr 25, 2014 at 4:40 PM, Tim Shen <timshen91@gmail.com> wrote:
> regex_match("", regex("(?:)*") will cause segfault because the
> infinite loop detection (writeen by me) is stupid. :O
Patch attached.
--
Regards,
Tim Shen
-------------- next part --------------
commit 71b52302793f501a60783ea6dec09202120ce592
Author: tim <timshen91@gmail.com>
Date: Fri Apr 25 01:50:11 2014 -0400
2014-04-25 Tim Shen <timshen91@gmail.com>
* include/bits/regex_executor.h: Add _M_rep_count to track how
many times this repeat node are visited.
* include/bits/regex_executor.tcc (_Executor<>::_M_rep_once_more,
_Executor<>::_M_dfs): Use _M_rep_count to prevent entering
infinite loop.
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 708c78e..d185c27 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -73,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_results(__results),
_M_match_queue(__dfs_mode ? nullptr
: new vector<pair<_StateIdT, _ResultsVec>>()),
+ _M_rep_count(_M_nfa.size()),
_M_visited(__dfs_mode ? nullptr : new vector<bool>(_M_nfa.size())),
_M_flags((__flags & regex_constants::match_prev_avail)
? (__flags
@@ -104,6 +105,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
private:
template<bool __match_mode>
void
+ _M_rep_once_more(_StateIdT);
+
+ template<bool __match_mode>
+ void
_M_dfs(_StateIdT __start);
template<bool __match_mode>
@@ -151,6 +156,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// character.
std::unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue;
// Used in BFS, indicating that which state is already visited.
+ std::vector<pair<_BiIter, int>> _M_rep_count;
std::unique_ptr<vector<bool>> _M_visited;
_FlagT _M_flags;
// To record current solution.
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 68a5e04..bb10a72 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -161,6 +161,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
+ // __rep_count records how many times (__rep_count.second)
+ // this node are visited under certain input iterator
+ // (__rep_count.first). This prevent the executor from entering
+ // infinite loop by refusing to continue when it's already been
+ // visited more than twice. It's `twice` instead of `once` because
+ // we need to spare one more time for potential group capture.
+ template<typename _BiIter, typename _Alloc, typename _TraitsT,
+ bool __dfs_mode>
+ template<bool __match_mode>
+ void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
+ _M_rep_once_more(_StateIdT __i)
+ {
+ const auto& __state = _M_nfa[__i];
+ auto& __rep_count = _M_rep_count[__i];
+ if (__rep_count.second == 0 || __rep_count.first != _M_current)
+ {
+ auto back = __rep_count;
+ __rep_count.first = _M_current;
+ __rep_count.second = 1;
+ _M_dfs<__match_mode>(__state._M_alt);
+ __rep_count = back;
+ }
+ else
+ {
+ if (__rep_count.second < 2)
+ {
+ __rep_count.second++;
+ _M_dfs<__match_mode>(__state._M_alt);
+ __rep_count.second--;
+ }
+ }
+ };
+
// TODO: Use a function vector to dispatch, instead of using switch-case.
template<typename _BiIter, typename _Alloc, typename _TraitsT,
bool __dfs_mode>
@@ -185,68 +218,60 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// mean the same thing, and we need to choose the correct order under
// given greedy mode.
case _S_opcode_alternative:
- // Greedy.
- if (!__state._M_neg)
- {
- // "Once more" is preferred in greedy mode.
- _M_dfs<__match_mode>(__state._M_alt);
- // If it's DFS executor and already accepted, we're done.
- if (!__dfs_mode || !_M_has_sol)
- _M_dfs<__match_mode>(__state._M_next);
- }
- else // Non-greedy mode
- {
- if (__dfs_mode)
- {
- // vice-versa.
+ {
+ // Greedy.
+ if (!__state._M_neg)
+ {
+ _M_rep_once_more<__match_mode>(__i);
+ // If it's DFS executor and already accepted, we're done.
+ if (!__dfs_mode || !_M_has_sol)
_M_dfs<__match_mode>(__state._M_next);
- if (!_M_has_sol)
- _M_dfs<__match_mode>(__state._M_alt);
- }
- else
- {
- // DON'T attempt anything, because there's already another
- // state with higher priority accepted. This state cannot be
- // better by attempting its next node.
- if (!_M_has_sol)
- {
- _M_dfs<__match_mode>(__state._M_next);
- // DON'T attempt anything if it's already accepted. An
- // accepted state *must* be better than a solution that
- // matches a non-greedy quantifier one more time.
- if (!_M_has_sol)
- _M_dfs<__match_mode>(__state._M_alt);
- }
- }
+ }
+ else // Non-greedy mode
+ {
+ if (__dfs_mode)
+ {
+ // vice-versa.
+ _M_dfs<__match_mode>(__state._M_next);
+ if (!_M_has_sol)
+ _M_rep_once_more<__match_mode>(__i);
+ }
+ else
+ {
+ // DON'T attempt anything, because there's already another
+ // state with higher priority accepted. This state cannot be
+ // better by attempting its next node.
+ if (!_M_has_sol)
+ {
+ _M_dfs<__match_mode>(__state._M_next);
+ // DON'T attempt anything if it's already accepted. An
+ // accepted state *must* be better than a solution that
+ // matches a non-greedy quantifier one more time.
+ if (!_M_has_sol)
+ _M_rep_once_more<__match_mode>(__i);
+ }
+ }
+ }
}
break;
case _S_opcode_subexpr_begin:
- // If there's nothing changed since last visit, do NOT continue.
- // This prevents the executor from get into infinite loop when using
- // "()*" to match "".
- if (!_M_cur_results[__state._M_subexpr].matched
- || _M_cur_results[__state._M_subexpr].first != _M_current)
- {
- auto& __res = _M_cur_results[__state._M_subexpr];
- auto __back = __res.first;
- __res.first = _M_current;
- _M_dfs<__match_mode>(__state._M_next);
- __res.first = __back;
- }
+ {
+ auto& __res = _M_cur_results[__state._M_subexpr];
+ auto __back = __res.first;
+ __res.first = _M_current;
+ _M_dfs<__match_mode>(__state._M_next);
+ __res.first = __back;
+ }
break;
case _S_opcode_subexpr_end:
- if (_M_cur_results[__state._M_subexpr].second != _M_current
- || _M_cur_results[__state._M_subexpr].matched != true)
- {
- auto& __res = _M_cur_results[__state._M_subexpr];
- auto __back = __res;
- __res.second = _M_current;
- __res.matched = true;
- _M_dfs<__match_mode>(__state._M_next);
- __res = __back;
- }
- else
+ {
+ auto& __res = _M_cur_results[__state._M_subexpr];
+ auto __back = __res;
+ __res.second = _M_current;
+ __res.matched = true;
_M_dfs<__match_mode>(__state._M_next);
+ __res = __back;
+ }
break;
case _S_opcode_line_begin_assertion:
if (_M_at_begin())
diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
index 3fdbdad..c33d1b6 100644
--- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
+++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
@@ -50,6 +50,7 @@ test01()
const char s[] = "";
VERIFY( regex_match_debug(s, m, re) );
}
+ VERIFY(regex_match_debug("", regex("(?:)*")));
}
int
More information about the Gcc-patches
mailing list