libstdc++
regex_nfa.h
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2010, 2011 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_nfa.h
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 namespace __regex
34 {
35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 
37  // Base class for, um, automata. Could be an NFA or a DFA. Your choice.
38  class _Automaton
39  {
40  public:
41  typedef unsigned int _SizeT;
42 
43  public:
44  virtual
45  ~_Automaton() { }
46 
47  virtual _SizeT
48  _M_sub_count() const = 0;
49 
50 #ifdef _GLIBCXX_DEBUG
51  virtual std::ostream&
52  _M_dot(std::ostream& __ostr) const = 0;
53 #endif
54  };
55 
56  // Generic shared pointer to an automaton.
57  typedef std::shared_ptr<_Automaton> _AutomatonPtr;
58 
59  // Operation codes that define the type of transitions within the base NFA
60  // that represents the regular expression.
61  enum _Opcode
62  {
63  _S_opcode_unknown = 0,
64  _S_opcode_alternative = 1,
65  _S_opcode_subexpr_begin = 4,
66  _S_opcode_subexpr_end = 5,
67  _S_opcode_match = 100,
68  _S_opcode_accept = 255
69  };
70 
71  // Provides a generic facade for a templated match_results.
72  struct _Results
73  {
74  virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
75  virtual void _M_set_matched(int __i, bool __is_matched) = 0;
76  };
77 
78  // Tags current state (for subexpr begin/end).
79  typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
80 
81  template<typename _FwdIterT, typename _TraitsT>
82  struct _StartTagger
83  {
84  explicit
85  _StartTagger(int __i)
86  : _M_index(__i)
87  { }
88 
89  void
90  operator()(const _PatternCursor& __pc, _Results& __r)
91  { __r._M_set_pos(_M_index, 0, __pc); }
92 
93  int _M_index;
94  };
95 
96  template<typename _FwdIterT, typename _TraitsT>
97  struct _EndTagger
98  {
99  explicit
100  _EndTagger(int __i)
101  : _M_index(__i)
102  { }
103 
104  void
105  operator()(const _PatternCursor& __pc, _Results& __r)
106  { __r._M_set_pos(_M_index, 1, __pc); }
107 
108  int _M_index;
109  _FwdIterT _M_pos;
110  };
111  // Indicates if current state matches cursor current.
112  typedef std::function<bool (const _PatternCursor&)> _Matcher;
113 
114  // Matches any character
115  inline bool
116  _AnyMatcher(const _PatternCursor&)
117  { return true; }
118 
119  // Matches a single character
120  template<typename _InIterT, typename _TraitsT>
121  struct _CharMatcher
122  {
123  typedef typename _TraitsT::char_type char_type;
124 
125  explicit
126  _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
127  : _M_traits(__t), _M_c(_M_traits.translate(__c))
128  { }
129 
130  bool
131  operator()(const _PatternCursor& __pc) const
132  {
133  typedef const _SpecializedCursor<_InIterT>& _CursorT;
134  _CursorT __c = static_cast<_CursorT>(__pc);
135  return _M_traits.translate(__c._M_current()) == _M_c;
136  }
137 
138  const _TraitsT& _M_traits;
139  char_type _M_c;
140  };
141 
142  // Matches a character range (bracket expression)
143  template<typename _InIterT, typename _TraitsT>
144  struct _RangeMatcher
145  {
146  typedef typename _TraitsT::char_type _CharT;
147  typedef std::basic_string<_CharT> _StringT;
148 
149  explicit
150  _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
151  : _M_traits(__t), _M_is_non_matching(__is_non_matching)
152  { }
153 
154  bool
155  operator()(const _PatternCursor& __pc) const
156  {
157  typedef const _SpecializedCursor<_InIterT>& _CursorT;
158  _CursorT __c = static_cast<_CursorT>(__pc);
159  return true;
160  }
161 
162  void
163  _M_add_char(_CharT __c)
164  { }
165 
166  void
167  _M_add_collating_element(const _StringT& __s)
168  { }
169 
170  void
171  _M_add_equivalence_class(const _StringT& __s)
172  { }
173 
174  void
175  _M_add_character_class(const _StringT& __s)
176  { }
177 
178  void
179  _M_make_range()
180  { }
181 
182  const _TraitsT& _M_traits;
183  bool _M_is_non_matching;
184  };
185 
186  // Identifies a state in the NFA.
187  typedef int _StateIdT;
188 
189  // The special case in which a state identifier is not an index.
190  static const _StateIdT _S_invalid_state_id = -1;
191 
192 
193  // An individual state in an NFA
194  //
195  // In this case a "state" is an entry in the NFA definition coupled with its
196  // outgoing transition(s). All states have a single outgoing transition,
197  // except for accepting states (which have no outgoing transitions) and alt
198  // states, which have two outgoing transitions.
199  //
200  struct _State
201  {
202  typedef int _OpcodeT;
203 
204  _OpcodeT _M_opcode; // type of outgoing transition
205  _StateIdT _M_next; // outgoing transition
206  _StateIdT _M_alt; // for _S_opcode_alternative
207  unsigned int _M_subexpr; // for _S_opcode_subexpr_*
208  _Tagger _M_tagger; // for _S_opcode_subexpr_*
209  _Matcher _M_matches; // for _S_opcode_match
210 
211  explicit _State(_OpcodeT __opcode)
212  : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
213  { }
214 
215  _State(const _Matcher& __m)
216  : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
217  { }
218 
219  _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
220  : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
221  _M_tagger(__t)
222  { }
223 
224  _State(_StateIdT __next, _StateIdT __alt)
225  : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
226  { }
227 
228 #ifdef _GLIBCXX_DEBUG
229  std::ostream&
230  _M_print(std::ostream& ostr) const;
231 
232  // Prints graphviz dot commands for state.
233  std::ostream&
234  _M_dot(std::ostream& __ostr, _StateIdT __id) const;
235 #endif
236  };
237 
238 
239  // The Grep Matcher works on sets of states. Here are sets of states.
240  typedef std::set<_StateIdT> _StateSet;
241 
242  // A collection of all states making up an NFA
243  //
244  // An NFA is a 4-tuple M = (K, S, s, F), where
245  // K is a finite set of states,
246  // S is the alphabet of the NFA,
247  // s is the initial state,
248  // F is a set of final (accepting) states.
249  //
250  // This NFA class is templated on S, a type that will hold values of the
251  // underlying alphabet (without regard to semantics of that alphabet). The
252  // other elements of the tuple are generated during construction of the NFA
253  // and are available through accessor member functions.
254  //
255  class _Nfa
256  : public _Automaton, public std::vector<_State>
257  {
258  public:
259  typedef _State _StateT;
260  typedef unsigned int _SizeT;
262 
263  public:
264  _Nfa(_FlagT __f)
265  : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
266  { }
267 
268  ~_Nfa()
269  { }
270 
271  _FlagT
272  _M_options() const
273  { return _M_flags; }
274 
275  _StateIdT
276  _M_start() const
277  { return _M_start_state; }
278 
279  const _StateSet&
280  _M_final_states() const
281  { return _M_accepting_states; }
282 
283  _SizeT
284  _M_sub_count() const
285  { return _M_subexpr_count; }
286 
287  _StateIdT
288  _M_insert_accept()
289  {
290  this->push_back(_StateT(_S_opcode_accept));
291  _M_accepting_states.insert(this->size()-1);
292  return this->size()-1;
293  }
294 
295  _StateIdT
296  _M_insert_alt(_StateIdT __next, _StateIdT __alt)
297  {
298  this->push_back(_StateT(__next, __alt));
299  return this->size()-1;
300  }
301 
302  _StateIdT
303  _M_insert_matcher(_Matcher __m)
304  {
305  this->push_back(_StateT(__m));
306  return this->size()-1;
307  }
308 
309  _StateIdT
310  _M_insert_subexpr_begin(const _Tagger& __t)
311  {
312  this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t));
313  return this->size()-1;
314  }
315 
316  _StateIdT
317  _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
318  {
319  this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
320  return this->size()-1;
321  }
322 
323 #ifdef _GLIBCXX_DEBUG
324  std::ostream&
325  _M_dot(std::ostream& __ostr) const;
326 #endif
327 
328  private:
329  _FlagT _M_flags;
330  _StateIdT _M_start_state;
331  _StateSet _M_accepting_states;
332  _SizeT _M_subexpr_count;
333  };
334 
335  // Describes a sequence of one or more %_State, its current start and end(s).
336  //
337  // This structure contains fragments of an NFA during construction.
338  class _StateSeq
339  {
340  public:
341  // Constructs a single-node sequence
342  _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
343  : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
344  { }
345  // Constructs a split sequence from two other sequencces
346  _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
347  : _M_nfa(__e1._M_nfa),
348  _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
349  _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
350  { }
351 
352  // Constructs a split sequence from a single sequence
353  _StateSeq(const _StateSeq& __e, _StateIdT __id)
354  : _M_nfa(__e._M_nfa),
355  _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
356  _M_end1(__id), _M_end2(__e._M_end1)
357  { }
358 
359  // Constructs a copy of a %_StateSeq
360  _StateSeq(const _StateSeq& __rhs)
361  : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
362  _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
363  { }
364 
365 
366  _StateSeq& operator=(const _StateSeq& __rhs);
367 
368  _StateIdT
369  _M_front() const
370  { return _M_start; }
371 
372  // Extends a sequence by one.
373  void
374  _M_push_back(_StateIdT __id);
375 
376  // Extends and maybe joins a sequence.
377  void
378  _M_append(_StateIdT __id);
379 
380  void
381  _M_append(_StateSeq& __rhs);
382 
383  // Clones an entire sequence.
384  _StateIdT
385  _M_clone();
386 
387  private:
388  _Nfa& _M_nfa;
389  _StateIdT _M_start;
390  _StateIdT _M_end1;
391  _StateIdT _M_end2;
392 
393  };
394 
395 _GLIBCXX_END_NAMESPACE_VERSION
396 } // namespace __regex
397 } // namespace std
398 
399 #include <bits/regex_nfa.tcc>
400