libstdc++
regex_scanner.tcc
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2016 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_scanner.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 // and awk
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no
38 // back-reference.
39 //
40 // References:
41 //
42 // ECMAScript: ECMA-262 15.10
43 //
44 // basic, extended:
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 //
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 namespace __detail
52 {
53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
54 
55  template<typename _CharT>
56  _Scanner<_CharT>::
57  _Scanner(typename _Scanner::_IterT __begin,
58  typename _Scanner::_IterT __end,
59  _FlagT __flags, std::locale __loc)
60  : _ScannerBase(__flags),
61  _M_current(__begin), _M_end(__end),
62  _M_ctype(std::use_facet<_CtypeT>(__loc)),
63  _M_eat_escape(_M_is_ecma()
64  ? &_Scanner::_M_eat_escape_ecma
65  : &_Scanner::_M_eat_escape_posix)
66  { _M_advance(); }
67 
68  template<typename _CharT>
69  void
70  _Scanner<_CharT>::
71  _M_advance()
72  {
73  if (_M_current == _M_end)
74  {
75  _M_token = _S_token_eof;
76  return;
77  }
78 
79  if (_M_state == _S_state_normal)
80  _M_scan_normal();
81  else if (_M_state == _S_state_in_bracket)
82  _M_scan_in_bracket();
83  else if (_M_state == _S_state_in_brace)
84  _M_scan_in_brace();
85  else
86  {
87  __glibcxx_assert(false);
88  }
89  }
90 
91  // Differences between styles:
92  // 1) "\(", "\)", "\{" in basic. It's not escaping.
93  // 2) "(?:", "(?=", "(?!" in ECMAScript.
94  template<typename _CharT>
95  void
96  _Scanner<_CharT>::
97  _M_scan_normal()
98  {
99  auto __c = *_M_current++;
100 
101  if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
102  {
103  _M_token = _S_token_ord_char;
104  _M_value.assign(1, __c);
105  return;
106  }
107  if (__c == '\\')
108  {
109  if (_M_current == _M_end)
110  __throw_regex_error(
112  "Unexpected end of regex when escaping.");
113 
114  if (!_M_is_basic()
115  || (*_M_current != '('
116  && *_M_current != ')'
117  && *_M_current != '{'))
118  {
119  (this->*_M_eat_escape)();
120  return;
121  }
122  __c = *_M_current++;
123  }
124  if (__c == '(')
125  {
126  if (_M_is_ecma() && *_M_current == '?')
127  {
128  if (++_M_current == _M_end)
129  __throw_regex_error(
131  "Unexpected end of regex when in an open parenthesis.");
132 
133  if (*_M_current == ':')
134  {
135  ++_M_current;
136  _M_token = _S_token_subexpr_no_group_begin;
137  }
138  else if (*_M_current == '=')
139  {
140  ++_M_current;
141  _M_token = _S_token_subexpr_lookahead_begin;
142  _M_value.assign(1, 'p');
143  }
144  else if (*_M_current == '!')
145  {
146  ++_M_current;
147  _M_token = _S_token_subexpr_lookahead_begin;
148  _M_value.assign(1, 'n');
149  }
150  else
151  __throw_regex_error(
153  "Invalid special open parenthesis.");
154  }
155  else if (_M_flags & regex_constants::nosubs)
156  _M_token = _S_token_subexpr_no_group_begin;
157  else
158  _M_token = _S_token_subexpr_begin;
159  }
160  else if (__c == ')')
161  _M_token = _S_token_subexpr_end;
162  else if (__c == '[')
163  {
164  _M_state = _S_state_in_bracket;
165  _M_at_bracket_start = true;
166  if (_M_current != _M_end && *_M_current == '^')
167  {
168  _M_token = _S_token_bracket_neg_begin;
169  ++_M_current;
170  }
171  else
172  _M_token = _S_token_bracket_begin;
173  }
174  else if (__c == '{')
175  {
176  _M_state = _S_state_in_brace;
177  _M_token = _S_token_interval_begin;
178  }
179  else if (__c != ']' && __c != '}')
180  {
181  auto __it = _M_token_tbl;
182  auto __narrowc = _M_ctype.narrow(__c, '\0');
183  for (; __it->first != '\0'; ++__it)
184  if (__it->first == __narrowc)
185  {
186  _M_token = __it->second;
187  return;
188  }
189  __glibcxx_assert(false);
190  }
191  else
192  {
193  _M_token = _S_token_ord_char;
194  _M_value.assign(1, __c);
195  }
196  }
197 
198  // Differences between styles:
199  // 1) different semantics of "[]" and "[^]".
200  // 2) Escaping in bracket expr.
201  template<typename _CharT>
202  void
203  _Scanner<_CharT>::
204  _M_scan_in_bracket()
205  {
206  if (_M_current == _M_end)
207  __throw_regex_error(
209  "Unexpected end of regex when in bracket expression.");
210 
211  auto __c = *_M_current++;
212 
213  if (__c == '[')
214  {
215  if (_M_current == _M_end)
216  __throw_regex_error(regex_constants::error_brack,
217  "Unexpected character class open bracket.");
218 
219  if (*_M_current == '.')
220  {
221  _M_token = _S_token_collsymbol;
222  _M_eat_class(*_M_current++);
223  }
224  else if (*_M_current == ':')
225  {
226  _M_token = _S_token_char_class_name;
227  _M_eat_class(*_M_current++);
228  }
229  else if (*_M_current == '=')
230  {
231  _M_token = _S_token_equiv_class_name;
232  _M_eat_class(*_M_current++);
233  }
234  else
235  {
236  _M_token = _S_token_ord_char;
237  _M_value.assign(1, __c);
238  }
239  }
240  // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
241  // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
242  // `*/empty_range.cc`.
243  else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
244  {
245  _M_token = _S_token_bracket_end;
246  _M_state = _S_state_normal;
247  }
248  // ECMAScript and awk permits escaping in bracket.
249  else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
250  (this->*_M_eat_escape)();
251  else
252  {
253  _M_token = _S_token_ord_char;
254  _M_value.assign(1, __c);
255  }
256  _M_at_bracket_start = false;
257  }
258 
259  // Differences between styles:
260  // 1) "\}" in basic style.
261  template<typename _CharT>
262  void
263  _Scanner<_CharT>::
264  _M_scan_in_brace()
265  {
266  if (_M_current == _M_end)
267  __throw_regex_error(
269  "Unexpected end of regex when in brace expression.");
270 
271  auto __c = *_M_current++;
272 
273  if (_M_ctype.is(_CtypeT::digit, __c))
274  {
275  _M_token = _S_token_dup_count;
276  _M_value.assign(1, __c);
277  while (_M_current != _M_end
278  && _M_ctype.is(_CtypeT::digit, *_M_current))
279  _M_value += *_M_current++;
280  }
281  else if (__c == ',')
282  _M_token = _S_token_comma;
283  // basic use \}.
284  else if (_M_is_basic())
285  {
286  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
287  {
288  _M_state = _S_state_normal;
289  _M_token = _S_token_interval_end;
290  ++_M_current;
291  }
292  else
293  __throw_regex_error(regex_constants::error_badbrace,
294  "Unexpected character in brace expression.");
295  }
296  else if (__c == '}')
297  {
298  _M_state = _S_state_normal;
299  _M_token = _S_token_interval_end;
300  }
301  else
302  __throw_regex_error(regex_constants::error_badbrace,
303  "Unexpected character in brace expression.");
304  }
305 
306  template<typename _CharT>
307  void
308  _Scanner<_CharT>::
309  _M_eat_escape_ecma()
310  {
311  if (_M_current == _M_end)
312  __throw_regex_error(regex_constants::error_escape,
313  "Unexpected end of regex when escaping.");
314 
315  auto __c = *_M_current++;
316  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
317 
318  if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
319  {
320  _M_token = _S_token_ord_char;
321  _M_value.assign(1, *__pos);
322  }
323  else if (__c == 'b')
324  {
325  _M_token = _S_token_word_bound;
326  _M_value.assign(1, 'p');
327  }
328  else if (__c == 'B')
329  {
330  _M_token = _S_token_word_bound;
331  _M_value.assign(1, 'n');
332  }
333  // N3376 28.13
334  else if (__c == 'd'
335  || __c == 'D'
336  || __c == 's'
337  || __c == 'S'
338  || __c == 'w'
339  || __c == 'W')
340  {
341  _M_token = _S_token_quoted_class;
342  _M_value.assign(1, __c);
343  }
344  else if (__c == 'c')
345  {
346  if (_M_current == _M_end)
347  __throw_regex_error(
349  "Unexpected end of regex when reading control code.");
350  _M_token = _S_token_ord_char;
351  _M_value.assign(1, *_M_current++);
352  }
353  else if (__c == 'x' || __c == 'u')
354  {
355  _M_value.erase();
356  for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
357  {
358  if (_M_current == _M_end
359  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
360  __throw_regex_error(
362  "Unexpected end of regex when ascii character.");
363  _M_value += *_M_current++;
364  }
365  _M_token = _S_token_hex_num;
366  }
367  // ECMAScript recognizes multi-digit back-references.
368  else if (_M_ctype.is(_CtypeT::digit, __c))
369  {
370  _M_value.assign(1, __c);
371  while (_M_current != _M_end
372  && _M_ctype.is(_CtypeT::digit, *_M_current))
373  _M_value += *_M_current++;
374  _M_token = _S_token_backref;
375  }
376  else
377  {
378  _M_token = _S_token_ord_char;
379  _M_value.assign(1, __c);
380  }
381  }
382 
383  // Differences between styles:
384  // 1) Extended doesn't support backref, but basic does.
385  template<typename _CharT>
386  void
387  _Scanner<_CharT>::
388  _M_eat_escape_posix()
389  {
390  if (_M_current == _M_end)
391  __throw_regex_error(regex_constants::error_escape,
392  "Unexpected end of regex when escaping.");
393 
394  auto __c = *_M_current;
395  auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
396 
397  if (__pos != nullptr && *__pos != '\0')
398  {
399  _M_token = _S_token_ord_char;
400  _M_value.assign(1, __c);
401  }
402  // We MUST judge awk before handling backrefs. There's no backref in awk.
403  else if (_M_is_awk())
404  {
405  _M_eat_escape_awk();
406  return;
407  }
408  else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
409  {
410  _M_token = _S_token_backref;
411  _M_value.assign(1, __c);
412  }
413  else
414  {
415 #ifdef __STRICT_ANSI__
416  // POSIX says it is undefined to escape ordinary characters
417  __throw_regex_error(regex_constants::error_escape,
418  "Unexpected escape character.");
419 #else
420  _M_token = _S_token_ord_char;
421  _M_value.assign(1, __c);
422 #endif
423  }
424  ++_M_current;
425  }
426 
427  template<typename _CharT>
428  void
429  _Scanner<_CharT>::
430  _M_eat_escape_awk()
431  {
432  auto __c = *_M_current++;
433  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
434 
435  if (__pos != nullptr)
436  {
437  _M_token = _S_token_ord_char;
438  _M_value.assign(1, *__pos);
439  }
440  // \ddd for oct representation
441  else if (_M_ctype.is(_CtypeT::digit, __c)
442  && __c != '8'
443  && __c != '9')
444  {
445  _M_value.assign(1, __c);
446  for (int __i = 0;
447  __i < 2
448  && _M_current != _M_end
449  && _M_ctype.is(_CtypeT::digit, *_M_current)
450  && *_M_current != '8'
451  && *_M_current != '9';
452  __i++)
453  _M_value += *_M_current++;
454  _M_token = _S_token_oct_num;
455  return;
456  }
457  else
458  __throw_regex_error(regex_constants::error_escape,
459  "Unexpected escape character.");
460  }
461 
462  // Eats a character class or throws an exception.
463  // __ch could be ':', '.' or '=', _M_current is the char after ']' when
464  // returning.
465  template<typename _CharT>
466  void
467  _Scanner<_CharT>::
468  _M_eat_class(char __ch)
469  {
470  for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
471  _M_value += *_M_current++;
472  if (_M_current == _M_end
473  || *_M_current++ != __ch
474  || _M_current == _M_end // skip __ch
475  || *_M_current++ != ']') // skip ']'
476  {
477  if (__ch == ':')
478  __throw_regex_error(regex_constants::error_ctype,
479  "Unexpected end of character class.");
480  else
481  __throw_regex_error(regex_constants::error_collate,
482  "Unexpected end of character class.");
483  }
484  }
485 
#ifdef _GLIBCXX_DEBUG
// Writes a one-line, human-readable description of the current token to
// ostr and returns ostr.  Tokens that carry text also print _M_value.
// Only compiled when _GLIBCXX_DEBUG is defined.
template<typename _CharT>
  std::ostream&
  _Scanner<_CharT>::
  _M_print(std::ostream& ostr)
  {
    switch (_M_token)
      {
      case _S_token_anychar:
        return ostr << "any-character\n";
      case _S_token_backref:
        return ostr << "backref\n";
      case _S_token_bracket_begin:
        return ostr << "bracket-begin\n";
      case _S_token_bracket_neg_begin:
        return ostr << "bracket-neg-begin\n";
      case _S_token_bracket_end:
        return ostr << "bracket-end\n";
      case _S_token_char_class_name:
        return ostr << "char-class-name \"" << _M_value << "\"\n";
      case _S_token_closure0:
        return ostr << "closure0\n";
      case _S_token_closure1:
        return ostr << "closure1\n";
      case _S_token_collsymbol:
        return ostr << "collsymbol \"" << _M_value << "\"\n";
      case _S_token_comma:
        return ostr << "comma\n";
      case _S_token_dup_count:
        return ostr << "dup count: " << _M_value << "\n";
      case _S_token_eof:
        return ostr << "EOF\n";
      case _S_token_equiv_class_name:
        return ostr << "equiv-class-name \"" << _M_value << "\"\n";
      case _S_token_interval_begin:
        return ostr << "interval begin\n";
      case _S_token_interval_end:
        return ostr << "interval end\n";
      case _S_token_line_begin:
        return ostr << "line begin\n";
      case _S_token_line_end:
        return ostr << "line end\n";
      case _S_token_opt:
        return ostr << "opt\n";
      case _S_token_or:
        return ostr << "or\n";
      case _S_token_ord_char:
        return ostr << "ordinary character: \"" << _M_value << "\"\n";
      case _S_token_subexpr_begin:
        return ostr << "subexpr begin\n";
      case _S_token_subexpr_no_group_begin:
        return ostr << "no grouping subexpr begin\n";
      case _S_token_subexpr_lookahead_begin:
        return ostr << "lookahead subexpr begin\n";
      case _S_token_subexpr_end:
        return ostr << "subexpr end\n";
      case _S_token_unknown:
        return ostr << "-- unknown token --\n";
      case _S_token_oct_num:
        return ostr << "oct number " << _M_value << "\n";
      case _S_token_hex_num:
        return ostr << "hex number " << _M_value << "\n";
      case _S_token_quoted_class:
        return ostr << "quoted class " << "\\" << _M_value << "\n";
      default:
        // Every token kind is expected to be listed above.
        _GLIBCXX_DEBUG_ASSERT(false);
      }
    return ostr;
  }
#endif
584 
585 _GLIBCXX_END_NAMESPACE_VERSION
586 } // namespace __detail
587 } // namespace
basic_string & assign(const basic_string &__str)
Set value to contents of another string.
constexpr error_type error_ctype(_S_error_ctype)
constexpr error_type error_paren(_S_error_paren)
constexpr syntax_option_type nosubs
bool is(mask __m, char_type __c) const
Test char_type classification.
constexpr error_type error_escape(_S_error_escape)
constexpr error_type error_brace(_S_error_brace)
constexpr error_type error_badbrace(_S_error_badbrace)
char narrow(char_type __c, char __dfault) const
Narrow char_type to char.
const _Facet & use_facet(const locale &__loc)
Return a facet.use_facet looks for and returns a reference to a facet of type Facet where Facet is th...
Container class for localization functionality.The locale class is first a class wrapper for C librar...
basic_string & erase(size_type __pos=0, size_type __n=npos)
Remove characters.
constexpr error_type error_brack(_S_error_brack)
constexpr error_type error_collate(_S_error_collate)
ISO C++ entities toplevel namespace is std.