libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // See below __regex_algo_impl to get what this is talking about. The default 00032 // value 1 indicated a conservative optimization without giving up worst case 00033 // performance. 00034 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 00035 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 00036 #endif 00037 00038 namespace std _GLIBCXX_VISIBILITY(default) 00039 { 00040 namespace __detail 00041 { 00042 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00043 00044 // Result of merging regex_match and regex_search. 00045 // 00046 // __policy now can be _S_auto (auto dispatch) and _S_alternate (use 00047 // the other one if possible, for test purpose). 00048 // 00049 // That __match_mode is true means regex_match, else regex_search. 00050 template<typename _BiIter, typename _Alloc, 00051 typename _CharT, typename _TraitsT, 00052 _RegexExecutorPolicy __policy, 00053 bool __match_mode> 00054 bool 00055 __regex_algo_impl(_BiIter __s, 00056 _BiIter __e, 00057 match_results<_BiIter, _Alloc>& __m, 00058 const basic_regex<_CharT, _TraitsT>& __re, 00059 regex_constants::match_flag_type __flags) 00060 { 00061 if (__re._M_automaton == nullptr) 00062 return false; 00063 00064 typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m; 00065 __res.resize(__re._M_automaton->_M_sub_count() + 2); 00066 for (auto& __it : __res) 00067 __it.matched = false; 00068 00069 // This function decide which executor to use under given circumstances. 00070 // The _S_auto policy now is the following: if a NFA has no 00071 // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 00072 // quantifiers (*, +, ?), the BFS executor will be used, other wise 00073 // DFS executor. This is because DFS executor has a exponential upper 00074 // bound, but better best-case performace. Meanwhile, BFS executor can 00075 // effectively prevent from exponential-long time matching (which must 00076 // contains many quantifiers), but it's slower in average. 00077 // 00078 // For simple regex, BFS executor could be 2 or more times slower than 00079 // DFS executor. 00080 // 00081 // Of course, BFS executor cannot handle back-references. 00082 bool __ret; 00083 if (!__re._M_automaton->_M_has_backref 00084 && (__policy == _RegexExecutorPolicy::_S_alternate 00085 || __re._M_automaton->_M_quant_count 00086 > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) 00087 { 00088 _Executor<_BiIter, _Alloc, _TraitsT, false> 00089 __executor(__s, __e, __m, __re, __flags); 00090 if (__match_mode) 00091 __ret = __executor._M_match(); 00092 else 00093 __ret = __executor._M_search(); 00094 } 00095 else 00096 { 00097 _Executor<_BiIter, _Alloc, _TraitsT, true> 00098 __executor(__s, __e, __m, __re, __flags); 00099 if (__match_mode) 00100 __ret = __executor._M_match(); 00101 else 00102 __ret = __executor._M_search(); 00103 } 00104 if (__ret) 00105 { 00106 for (auto __it : __res) 00107 if (!__it.matched) 00108 __it.first = __it.second = __e; 00109 auto& __pre = __res[__res.size()-2]; 00110 auto& __suf = __res[__res.size()-1]; 00111 if (__match_mode) 00112 { 00113 __pre.matched = false; 00114 __pre.first = __s; 00115 __pre.second = __s; 00116 __suf.matched = false; 00117 __suf.first = __e; 00118 __suf.second = __e; 00119 } 00120 else 00121 { 00122 __pre.first = __s; 00123 __pre.second = __res[0].first; 00124 __pre.matched = (__pre.first != __pre.second); 00125 __suf.first = __res[0].second; 00126 __suf.second = __e; 00127 __suf.matched = (__suf.first != __suf.second); 00128 } 00129 } 00130 return __ret; 00131 } 00132 00133 _GLIBCXX_END_NAMESPACE_VERSION 00134 } 00135 00136 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00137 00138 template<typename _Ch_type> 00139 template<typename _Fwd_iter> 00140 typename regex_traits<_Ch_type>::string_type 00141 regex_traits<_Ch_type>:: 00142 lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const 00143 { 00144 typedef std::ctype<char_type> __ctype_type; 00145 const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); 00146 00147 static const char* __collatenames[] = 00148 { 00149 "NUL", 00150 "SOH", 00151 "STX", 00152 "ETX", 00153 "EOT", 00154 "ENQ", 00155 "ACK", 00156 "alert", 00157 "backspace", 00158 "tab", 00159 "newline", 00160 "vertical-tab", 00161 "form-feed", 00162 "carriage-return", 00163 "SO", 00164 "SI", 00165 "DLE", 00166 "DC1", 00167 "DC2", 00168 "DC3", 00169 "DC4", 00170 "NAK", 00171 "SYN", 00172 "ETB", 00173 "CAN", 00174 "EM", 00175 "SUB", 00176 "ESC", 00177 "IS4", 00178 "IS3", 00179 "IS2", 00180 "IS1", 00181 "space", 00182 "exclamation-mark", 00183 "quotation-mark", 00184 "number-sign", 00185 "dollar-sign", 00186 "percent-sign", 00187 "ampersand", 00188 "apostrophe", 00189 "left-parenthesis", 00190 "right-parenthesis", 00191 "asterisk", 00192 "plus-sign", 00193 "comma", 00194 "hyphen", 00195 "period", 00196 "slash", 00197 "zero", 00198 "one", 00199 "two", 00200 "three", 00201 "four", 00202 "five", 00203 "six", 00204 "seven", 00205 "eight", 00206 "nine", 00207 "colon", 00208 "semicolon", 00209 "less-than-sign", 00210 "equals-sign", 00211 "greater-than-sign", 00212 "question-mark", 00213 "commercial-at", 00214 "A", 00215 "B", 00216 "C", 00217 "D", 00218 "E", 00219 "F", 00220 "G", 00221 "H", 00222 "I", 00223 "J", 00224 "K", 00225 "L", 00226 "M", 00227 "N", 00228 "O", 00229 "P", 00230 "Q", 00231 "R", 00232 "S", 00233 "T", 00234 "U", 00235 "V", 00236 "W", 00237 "X", 00238 "Y", 00239 "Z", 00240 "left-square-bracket", 00241 "backslash", 00242 "right-square-bracket", 00243 "circumflex", 00244 "underscore", 00245 "grave-accent", 00246 "a", 00247 "b", 00248 "c", 00249 "d", 00250 "e", 00251 "f", 00252 "g", 00253 "h", 00254 "i", 00255 "j", 00256 "k", 00257 "l", 00258 "m", 00259 "n", 00260 "o", 00261 "p", 00262 "q", 00263 "r", 00264 "s", 00265 "t", 00266 "u", 00267 "v", 00268 "w", 00269 "x", 00270 "y", 00271 "z", 00272 "left-curly-bracket", 00273 "vertical-line", 00274 "right-curly-bracket", 00275 "tilde", 00276 "DEL", 00277 "" 00278 }; 00279 00280 // same as boost 00281 //static const char* __digraphs[] = 00282 // { 00283 // "ae", 00284 // "Ae", 00285 // "AE", 00286 // "ch", 00287 // "Ch", 00288 // "CH", 00289 // "ll", 00290 // "Ll", 00291 // "LL", 00292 // "ss", 00293 // "Ss", 00294 // "SS", 00295 // "nj", 00296 // "Nj", 00297 // "NJ", 00298 // "dz", 00299 // "Dz", 00300 // "DZ", 00301 // "lj", 00302 // "Lj", 00303 // "LJ", 00304 // "" 00305 // }; 00306 00307 std::string __s(__last - __first, '?'); 00308 __fctyp.narrow(__first, __last, '?', &*__s.begin()); 00309 00310 for (unsigned int __i = 0; *__collatenames[__i]; __i++) 00311 if (__s == __collatenames[__i]) 00312 return string_type(1, __fctyp.widen(static_cast<char>(__i))); 00313 00314 //for (unsigned int __i = 0; *__digraphs[__i]; __i++) 00315 // { 00316 // const char* __now = __digraphs[__i]; 00317 // if (__s == __now) 00318 // { 00319 // string_type ret(__s.size(), __fctyp.widen('?')); 00320 // __fctyp.widen(__now, __now + 2/* ouch */, &*ret.begin()); 00321 // return ret; 00322 // } 00323 // } 00324 return string_type(); 00325 } 00326 00327 template<typename _Ch_type> 00328 template<typename _Fwd_iter> 00329 typename regex_traits<_Ch_type>::char_class_type 00330 regex_traits<_Ch_type>:: 00331 lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const 00332 { 00333 typedef std::ctype<char_type> __ctype_type; 00334 typedef std::ctype<char> __cctype_type; 00335 typedef const pair<const char*, char_class_type> _ClassnameEntry; 00336 const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); 00337 const __cctype_type& __cctyp(use_facet<__cctype_type>(_M_locale)); 00338 00339 static _ClassnameEntry __classnames[] = 00340 { 00341 {"d", ctype_base::digit}, 00342 {"w", {ctype_base::alnum, _RegexMask::_S_under}}, 00343 {"s", ctype_base::space}, 00344 {"alnum", ctype_base::alnum}, 00345 {"alpha", ctype_base::alpha}, 00346 {"blank", {0, _RegexMask::_S_blank}}, 00347 {"cntrl", ctype_base::cntrl}, 00348 {"digit", ctype_base::digit}, 00349 {"graph", ctype_base::graph}, 00350 {"lower", ctype_base::lower}, 00351 {"print", ctype_base::print}, 00352 {"punct", ctype_base::punct}, 00353 {"space", ctype_base::space}, 00354 {"upper", ctype_base::upper}, 00355 {"xdigit", ctype_base::xdigit}, 00356 }; 00357 00358 std::string __s(__last - __first, '?'); 00359 __fctyp.narrow(__first, __last, '?', &__s[0]); 00360 __cctyp.tolower(&*__s.begin(), &*__s.begin() + __s.size()); 00361 for (_ClassnameEntry* __it = __classnames; 00362 __it < *(&__classnames + 1); 00363 ++__it) 00364 { 00365 if (__s == __it->first) 00366 { 00367 if (__icase 00368 && ((__it->second 00369 & (ctype_base::lower | ctype_base::upper)) != 0)) 00370 return ctype_base::alpha; 00371 return __it->second; 00372 } 00373 } 00374 return 0; 00375 } 00376 00377 template<typename _Ch_type> 00378 bool 00379 regex_traits<_Ch_type>:: 00380 isctype(_Ch_type __c, char_class_type __f) const 00381 { 00382 typedef std::ctype<char_type> __ctype_type; 00383 const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); 00384 00385 return __fctyp.is(__f._M_base, __c) 00386 // [[:w:]] 00387 || ((__f._M_extended & _RegexMask::_S_under) 00388 && __c == __fctyp.widen('_')) 00389 // [[:blank:]] 00390 || ((__f._M_extended & _RegexMask::_S_blank) 00391 && (__c == __fctyp.widen(' ') 00392 || __c == __fctyp.widen('\t'))); 00393 } 00394 00395 template<typename _Ch_type> 00396 int 00397 regex_traits<_Ch_type>:: 00398 value(_Ch_type __ch, int __radix) const 00399 { 00400 std::basic_istringstream<char_type> __is(string_type(1, __ch)); 00401 long __v; 00402 if (__radix == 8) 00403 __is >> std::oct; 00404 else if (__radix == 16) 00405 __is >> std::hex; 00406 __is >> __v; 00407 return __is.fail() ? -1 : __v; 00408 } 00409 00410 template<typename _Bi_iter, typename _Alloc> 00411 template<typename _Out_iter> 00412 _Out_iter match_results<_Bi_iter, _Alloc>:: 00413 format(_Out_iter __out, 00414 const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first, 00415 const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last, 00416 match_flag_type __flags) const 00417 { 00418 _GLIBCXX_DEBUG_ASSERT( ready() ); 00419 regex_traits<char_type> __traits; 00420 typedef std::ctype<char_type> __ctype_type; 00421 const __ctype_type& 00422 __fctyp(use_facet<__ctype_type>(__traits.getloc())); 00423 00424 auto __output = [&](size_t __idx) 00425 { 00426 auto& __sub = _Base_type::operator[](__idx); 00427 if (__sub.matched) 00428 __out = std::copy(__sub.first, __sub.second, __out); 00429 }; 00430 00431 if (__flags & regex_constants::format_sed) 00432 { 00433 for (; __fmt_first != __fmt_last;) 00434 if (*__fmt_first == '&') 00435 { 00436 __output(0); 00437 ++__fmt_first; 00438 } 00439 else if (*__fmt_first == '\\') 00440 { 00441 if (++__fmt_first != __fmt_last 00442 && __fctyp.is(__ctype_type::digit, *__fmt_first)) 00443 __output(__traits.value(*__fmt_first++, 10)); 00444 else 00445 *__out++ = '\\'; 00446 } 00447 else 00448 *__out++ = *__fmt_first++; 00449 } 00450 else 00451 { 00452 while (1) 00453 { 00454 auto __next = std::find(__fmt_first, __fmt_last, '$'); 00455 if (__next == __fmt_last) 00456 break; 00457 00458 __out = std::copy(__fmt_first, __next, __out); 00459 00460 auto __eat = [&](char __ch) -> bool 00461 { 00462 if (*__next == __ch) 00463 { 00464 ++__next; 00465 return true; 00466 } 00467 return false; 00468 }; 00469 00470 if (++__next == __fmt_last) 00471 *__out++ = '$'; 00472 else if (__eat('$')) 00473 *__out++ = '$'; 00474 else if (__eat('&')) 00475 __output(0); 00476 else if (__eat('`')) 00477 __output(_Base_type::size()-2); 00478 else if (__eat('\'')) 00479 __output(_Base_type::size()-1); 00480 else if (__fctyp.is(__ctype_type::digit, *__next)) 00481 { 00482 long __num = __traits.value(*__next, 10); 00483 if (++__next != __fmt_last 00484 && __fctyp.is(__ctype_type::digit, *__next)) 00485 { 00486 __num *= 10; 00487 __num += __traits.value(*__next++, 10); 00488 } 00489 if (0 <= __num && __num < this->size()) 00490 __output(__num); 00491 } 00492 else 00493 *__out++ = '$'; 00494 __fmt_first = __next; 00495 } 00496 __out = std::copy(__fmt_first, __fmt_last, __out); 00497 } 00498 return __out; 00499 } 00500 00501 template<typename _Out_iter, typename _Bi_iter, 00502 typename _Rx_traits, typename _Ch_type> 00503 _Out_iter 00504 regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last, 00505 const basic_regex<_Ch_type, _Rx_traits>& __e, 00506 const _Ch_type* __fmt, 00507 regex_constants::match_flag_type __flags) 00508 { 00509 typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT; 00510 _IterT __i(__first, __last, __e, __flags); 00511 _IterT __end; 00512 if (__i == __end) 00513 { 00514 if (!(__flags & regex_constants::format_no_copy)) 00515 __out = std::copy(__first, __last, __out); 00516 } 00517 else 00518 { 00519 sub_match<_Bi_iter> __last; 00520 auto __len = char_traits<_Ch_type>::length(__fmt); 00521 for (; __i != __end; ++__i) 00522 { 00523 if (!(__flags & regex_constants::format_no_copy)) 00524 __out = std::copy(__i->prefix().first, __i->prefix().second, 00525 __out); 00526 __out = __i->format(__out, __fmt, __fmt + __len, __flags); 00527 __last = __i->suffix(); 00528 if (__flags & regex_constants::format_first_only) 00529 break; 00530 } 00531 if (!(__flags & regex_constants::format_no_copy)) 00532 __out = std::copy(__last.first, __last.second, __out); 00533 } 00534 return __out; 00535 } 00536 00537 template<typename _Bi_iter, 00538 typename _Ch_type, 00539 typename _Rx_traits> 00540 bool 00541 regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00542 operator==(const regex_iterator& __rhs) const 00543 { 00544 return (_M_match.empty() && __rhs._M_match.empty()) 00545 || (_M_begin == __rhs._M_begin 00546 && _M_end == __rhs._M_end 00547 && _M_pregex == __rhs._M_pregex 00548 && _M_flags == __rhs._M_flags 00549 && _M_match[0] == __rhs._M_match[0]); 00550 } 00551 00552 template<typename _Bi_iter, 00553 typename _Ch_type, 00554 typename _Rx_traits> 00555 regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>& 00556 regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00557 operator++() 00558 { 00559 // In all cases in which the call to regex_search returns true, 00560 // match.prefix().first shall be equal to the previous value of 00561 // match[0].second, and for each index i in the half-open range 00562 // [0, match.size()) for which match[i].matched is true, 00563 // match[i].position() shall return distance(begin, match[i].first). 00564 // [28.12.1.4.5] 00565 if (_M_match[0].matched) 00566 { 00567 auto __start = _M_match[0].second; 00568 auto __prefix_first = _M_match[0].second; 00569 if (_M_match[0].first == _M_match[0].second) 00570 { 00571 if (__start == _M_end) 00572 { 00573 _M_match = value_type(); 00574 return *this; 00575 } 00576 else 00577 { 00578 if (regex_search(__start, _M_end, _M_match, *_M_pregex, 00579 _M_flags 00580 | regex_constants::match_not_null 00581 | regex_constants::match_continuous)) 00582 { 00583 _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched); 00584 _M_match.at(_M_match.size()).first = __prefix_first; 00585 _M_match._M_in_iterator = true; 00586 _M_match._M_begin = _M_begin; 00587 return *this; 00588 } 00589 else 00590 ++__start; 00591 } 00592 } 00593 _M_flags |= regex_constants::match_prev_avail; 00594 if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags)) 00595 { 00596 _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched); 00597 _M_match.at(_M_match.size()).first = __prefix_first; 00598 _M_match._M_in_iterator = true; 00599 _M_match._M_begin = _M_begin; 00600 } 00601 else 00602 _M_match = value_type(); 00603 } 00604 return *this; 00605 } 00606 00607 template<typename _Bi_iter, 00608 typename _Ch_type, 00609 typename _Rx_traits> 00610 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>& 00611 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00612 operator=(const regex_token_iterator& __rhs) 00613 { 00614 _M_position = __rhs._M_position; 00615 _M_subs = __rhs._M_subs; 00616 _M_n = __rhs._M_n; 00617 _M_result = __rhs._M_result; 00618 _M_suffix = __rhs._M_suffix; 00619 _M_has_m1 = __rhs._M_has_m1; 00620 if (__rhs._M_result == &__rhs._M_suffix) 00621 _M_result = &_M_suffix; 00622 return *this; 00623 } 00624 00625 template<typename _Bi_iter, 00626 typename _Ch_type, 00627 typename _Rx_traits> 00628 bool 00629 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00630 operator==(const regex_token_iterator& __rhs) const 00631 { 00632 if (_M_end_of_seq() && __rhs._M_end_of_seq()) 00633 return true; 00634 if (_M_suffix.matched && __rhs._M_suffix.matched 00635 && _M_suffix == __rhs._M_suffix) 00636 return true; 00637 if (_M_end_of_seq() || _M_suffix.matched 00638 || __rhs._M_end_of_seq() || __rhs._M_suffix.matched) 00639 return false; 00640 return _M_position == __rhs._M_position 00641 && _M_n == __rhs._M_n 00642 && _M_subs == __rhs._M_subs; 00643 } 00644 00645 template<typename _Bi_iter, 00646 typename _Ch_type, 00647 typename _Rx_traits> 00648 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>& 00649 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00650 operator++() 00651 { 00652 _Position __prev = _M_position; 00653 if (_M_suffix.matched) 00654 *this = regex_token_iterator(); 00655 else if (_M_n + 1 < _M_subs.size()) 00656 { 00657 _M_n++; 00658 _M_result = &_M_current_match(); 00659 } 00660 else 00661 { 00662 _M_n = 0; 00663 ++_M_position; 00664 if (_M_position != _Position()) 00665 _M_result = &_M_current_match(); 00666 else if (_M_has_m1 && __prev->suffix().length() != 0) 00667 { 00668 _M_suffix.matched = true; 00669 _M_suffix.first = __prev->suffix().first; 00670 _M_suffix.second = __prev->suffix().second; 00671 _M_result = &_M_suffix; 00672 } 00673 else 00674 *this = regex_token_iterator(); 00675 } 00676 return *this; 00677 } 00678 00679 template<typename _Bi_iter, 00680 typename _Ch_type, 00681 typename _Rx_traits> 00682 void 00683 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00684 _M_init(_Bi_iter __a, _Bi_iter __b) 00685 { 00686 _M_has_m1 = false; 00687 for (auto __it : _M_subs) 00688 if (__it == -1) 00689 { 00690 _M_has_m1 = true; 00691 break; 00692 } 00693 if (_M_position != _Position()) 00694 _M_result = &_M_current_match(); 00695 else if (_M_has_m1) 00696 { 00697 _M_suffix.matched = true; 00698 _M_suffix.first = __a; 00699 _M_suffix.second = __b; 00700 _M_result = &_M_suffix; 00701 } 00702 else 00703 _M_result = nullptr; 00704 } 00705 00706 _GLIBCXX_END_NAMESPACE_VERSION 00707 } // namespace 00708