libstdc++
regex_nfa.h
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2010-2013 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_nfa.h
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 namespace __detail
34 {
35 _GLIBCXX_BEGIN_NAMESPACE_VERSION
36 
37  /**
38  * @addtogroup regex-detail
39  * @{
40  */
41 
42  /// Base class for, um, automata. Could be an NFA or a DFA. Your choice.
43  class _Automaton
44  {
45  public:
46  typedef unsigned int _SizeT;
47 
48  public:
49  virtual
50  ~_Automaton() { }
51 
52  virtual _SizeT
53  _M_sub_count() const = 0;
54 
55 #ifdef _GLIBCXX_DEBUG
56  virtual std::ostream&
57  _M_dot(std::ostream& __ostr) const = 0;
58 #endif
59  };
60 
61  /// Generic shared pointer to an automaton.
63 
64  /// Operation codes that define the type of transitions within the base NFA
65  /// that represents the regular expression.
66  enum _Opcode
67  {
68  _S_opcode_unknown = 0,
69  _S_opcode_alternative = 1,
70  _S_opcode_subexpr_begin = 4,
71  _S_opcode_subexpr_end = 5,
72  _S_opcode_match = 100,
73  _S_opcode_accept = 255
74  };
75 
76  /// Provides a generic facade for a templated match_results.
77  struct _Results
78  {
79  virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
80  virtual void _M_set_matched(int __i, bool __is_matched) = 0;
81  };
82 
83  /// Tags current state (for subexpr begin/end).
84  typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
85 
86  /// Start state tag.
87  template<typename _FwdIterT, typename _TraitsT>
88  struct _StartTagger
89  {
90  explicit
91  _StartTagger(int __i)
92  : _M_index(__i)
93  { }
94 
95  void
96  operator()(const _PatternCursor& __pc, _Results& __r)
97  { __r._M_set_pos(_M_index, 0, __pc); }
98 
99  int _M_index;
100  };
101 
102  /// End state tag.
103  template<typename _FwdIterT, typename _TraitsT>
104  struct _EndTagger
105  {
106  explicit
107  _EndTagger(int __i)
108  : _M_index(__i)
109  { }
110 
111  void
112  operator()(const _PatternCursor& __pc, _Results& __r)
113  { __r._M_set_pos(_M_index, 1, __pc); }
114 
115  int _M_index;
116  _FwdIterT _M_pos;
117  };
118 
119  /// Indicates if current state matches cursor current.
120  typedef std::function<bool (const _PatternCursor&)> _Matcher;
121 
122  /// Matches any character
123  inline bool
125  { return true; }
126 
127  /// Matches a single character
128  template<typename _InIterT, typename _TraitsT>
130  {
131  typedef typename _TraitsT::char_type char_type;
132 
133  explicit
134  _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
135  : _M_traits(__t), _M_c(_M_traits.translate(__c))
136  { }
137 
138  bool
139  operator()(const _PatternCursor& __pc) const
140  {
141  typedef const _SpecializedCursor<_InIterT>& _CursorT;
142  _CursorT __c = static_cast<_CursorT>(__pc);
143  return _M_traits.translate(__c._M_current()) == _M_c;
144  }
145 
146  const _TraitsT& _M_traits;
147  char_type _M_c;
148  };
149 
150  /// Matches a character range (bracket expression)
151  template<typename _InIterT, typename _TraitsT>
153  {
154  typedef typename _TraitsT::char_type _CharT;
156 
157  explicit
158  _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
159  : _M_traits(__t), _M_is_non_matching(__is_non_matching)
160  { }
161 
162  bool
163  operator()(const _PatternCursor& __pc) const
164  {
165  typedef const _SpecializedCursor<_InIterT>& _CursorT;
166  _CursorT __c = static_cast<_CursorT>(__pc);
167  return true;
168  }
169 
170  void
171  _M_add_char(_CharT __c)
172  { }
173 
174  void
175  _M_add_collating_element(const _StringT& __s)
176  { }
177 
178  void
179  _M_add_equivalence_class(const _StringT& __s)
180  { }
181 
182  void
183  _M_add_character_class(const _StringT& __s)
184  { }
185 
186  void
187  _M_make_range()
188  { }
189 
190  const _TraitsT& _M_traits;
191  bool _M_is_non_matching;
192  };
193 
194  /// Identifies a state in the NFA.
195  typedef int _StateIdT;
196 
197  /// The special case in which a state identifier is not an index.
198  static const _StateIdT _S_invalid_state_id = -1;
199 
200 
201  /**
202  * @brief struct _State
203  *
204  * An individual state in an NFA
205  *
206  * In this case a "state" is an entry in the NFA definition coupled
207  * with its outgoing transition(s). All states have a single outgoing
208  * transition, except for accepting states (which have no outgoing
209  * transitions) and alt states, which have two outgoing transitions.
210  */
211  struct _State
212  {
213  typedef int _OpcodeT;
214 
215  _OpcodeT _M_opcode; // type of outgoing transition
216  _StateIdT _M_next; // outgoing transition
217  _StateIdT _M_alt; // for _S_opcode_alternative
218  unsigned int _M_subexpr; // for _S_opcode_subexpr_*
219  _Tagger _M_tagger; // for _S_opcode_subexpr_*
220  _Matcher _M_matches; // for _S_opcode_match
221 
222  explicit _State(_OpcodeT __opcode)
223  : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
224  { }
225 
226  _State(const _Matcher& __m)
227  : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
228  { }
229 
230  _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
231  : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
232  _M_tagger(__t)
233  { }
234 
235  _State(_StateIdT __next, _StateIdT __alt)
236  : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
237  { }
238 
239 #ifdef _GLIBCXX_DEBUG
240  std::ostream&
241  _M_print(std::ostream& ostr) const;
242 
243  // Prints graphviz dot commands for state.
244  std::ostream&
245  _M_dot(std::ostream& __ostr, _StateIdT __id) const;
246 #endif
247  };
248 
249 
250  /// The Grep Matcher works on sets of states. Here are sets of states.
252 
253  /**
254  * @brief struct _Nfa
255  *
256  * A collection of all states making up an NFA.
257  *
258  * An NFA is a 4-tuple M = (K, S, s, F), where
259  * K is a finite set of states,
260  * S is the alphabet of the NFA,
261  * s is the initial state,
262  * F is a set of final (accepting) states.
263  *
264  * This NFA class is templated on S, a type that will hold values of the
265  * underlying alphabet (without regard to semantics of that alphabet). The
266  * other elements of the tuple are generated during construction of the NFA
267  * and are available through accessor member functions.
268  */
269  class _Nfa
270  : public _Automaton, public std::vector<_State>
271  {
272  public:
273  typedef _State _StateT;
274  typedef unsigned int _SizeT;
276 
277  _Nfa(_FlagT __f)
278  : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
279  { }
280 
281  ~_Nfa()
282  { }
283 
284  _FlagT
285  _M_options() const
286  { return _M_flags; }
287 
288  _StateIdT
289  _M_start() const
290  { return _M_start_state; }
291 
292  const _StateSet&
293  _M_final_states() const
294  { return _M_accepting_states; }
295 
296  _SizeT
297  _M_sub_count() const
298  { return _M_subexpr_count; }
299 
300  _StateIdT
301  _M_insert_accept()
302  {
303  this->push_back(_StateT(_S_opcode_accept));
304  _M_accepting_states.insert(this->size()-1);
305  return this->size()-1;
306  }
307 
308  _StateIdT
309  _M_insert_alt(_StateIdT __next, _StateIdT __alt)
310  {
311  this->push_back(_StateT(__next, __alt));
312  return this->size()-1;
313  }
314 
315  _StateIdT
316  _M_insert_matcher(_Matcher __m)
317  {
318  this->push_back(_StateT(__m));
319  return this->size()-1;
320  }
321 
322  _StateIdT
323  _M_insert_subexpr_begin(const _Tagger& __t)
324  {
325  this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++,
326  __t));
327  return this->size()-1;
328  }
329 
330  _StateIdT
331  _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
332  {
333  this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
334  return this->size()-1;
335  }
336 
337 #ifdef _GLIBCXX_DEBUG
338  std::ostream&
339  _M_dot(std::ostream& __ostr) const;
340 #endif
341 
342  private:
343  _FlagT _M_flags;
344  _StateIdT _M_start_state;
345  _StateSet _M_accepting_states;
346  _SizeT _M_subexpr_count;
347  };
348 
349  /// Describes a sequence of one or more %_State, its current start
350  /// and end(s). This structure contains fragments of an NFA during
351  /// construction.
352  class _StateSeq
353  {
354  public:
355  // Constructs a single-node sequence
356  _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
357  : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
358  { }
359  // Constructs a split sequence from two other sequencces
360  _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
361  : _M_nfa(__e1._M_nfa),
362  _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
363  _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
364  { }
365 
366  // Constructs a split sequence from a single sequence
367  _StateSeq(const _StateSeq& __e, _StateIdT __id)
368  : _M_nfa(__e._M_nfa),
369  _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
370  _M_end1(__id), _M_end2(__e._M_end1)
371  { }
372 
373  // Constructs a copy of a %_StateSeq
374  _StateSeq(const _StateSeq& __rhs)
375  : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
376  _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
377  { }
378 
379 
380  _StateSeq& operator=(const _StateSeq& __rhs);
381 
382  _StateIdT
383  _M_front() const
384  { return _M_start; }
385 
386  // Extends a sequence by one.
387  void
388  _M_push_back(_StateIdT __id);
389 
390  // Extends and maybe joins a sequence.
391  void
392  _M_append(_StateIdT __id);
393 
394  void
395  _M_append(_StateSeq& __rhs);
396 
397  // Clones an entire sequence.
398  _StateIdT
399  _M_clone();
400 
401  private:
402  _Nfa& _M_nfa;
403  _StateIdT _M_start;
404  _StateIdT _M_end1;
405  _StateIdT _M_end2;
406 
407  };
408 
409  //@} regex-detail
410 _GLIBCXX_END_NAMESPACE_VERSION
411 } // namespace __detail
412 } // namespace std
413 
414 #include <bits/regex_nfa.tcc>
415 
void push_back(const value_type &__x)
Add data to the end of the vector.
Definition: stl_vector.h:901
A standard container which offers fixed time access to individual elements in any order...
Definition: stl_vector.h:210
Matches a single character.
Definition: regex_nfa.h:129
Start state tag.
Definition: regex_nfa.h:88
struct _Nfa
Definition: regex_nfa.h:269
ABC for pattern matching.
Definition: regex_cursor.h:44
Provides a generic facade for a templated match_results.
Definition: regex_nfa.h:77
Base class for, um, automata. Could be an NFA or a DFA. Your choice.
Definition: regex_nfa.h:43
std::pair< iterator, bool > insert(const value_type &__x)
Attempts to insert an element into the set.
Definition: stl_set.h:460
_Opcode
Operation codes that define the type of transitions within the base NFA that represents the regular e...
Definition: regex_nfa.h:66
static const _StateIdT _S_invalid_state_id
The special case in which a state identifier is not an index.
Definition: regex_nfa.h:198
ISO C++ entities toplevel namespace is std.
Describes a sequence of one or more _State, its current start and end(s). This structure contains fra...
Definition: regex_nfa.h:352
std::function< void(const _PatternCursor &, _Results &)> _Tagger
Tags current state (for subexpr begin/end).
Definition: regex_nfa.h:84
std::shared_ptr< _Automaton > _AutomatonPtr
Generic shared pointer to an automaton.
Definition: regex_nfa.h:62
Matches a character range (bracket expression)
Definition: regex_nfa.h:152
bool _AnyMatcher(const _PatternCursor &)
Matches any character.
Definition: regex_nfa.h:124
unsigned int syntax_option_type
This is a bitmask type indicating how to interpret the regex.
Provides a cursor into the specific target string.
Definition: regex_cursor.h:53
End state tag.
Definition: regex_nfa.h:104
struct _State
Definition: regex_nfa.h:211
std::set< _StateIdT > _StateSet
The Grep Matcher works on sets of states. Here are sets of states.
Definition: regex_nfa.h:251
size_type size() const noexcept
Definition: stl_vector.h:645
int _StateIdT
Identifies a state in the NFA.
Definition: regex_nfa.h:195
std::function< bool(const _PatternCursor &)> _Matcher
Indicates if current state matches cursor current.
Definition: regex_nfa.h:120