libstdc++
regex_compiler.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010-2017 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00034 _GLIBCXX_BEGIN_NAMESPACE_CXX11
00035 
00036   template<typename>
00037     class regex_traits;
00038 
00039 _GLIBCXX_END_NAMESPACE_CXX11
00040 _GLIBCXX_END_NAMESPACE_VERSION
00041 
00042 namespace __detail
00043 {
00044 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00045 
00046   /**
00047    * @addtogroup regex-detail
00048    * @{
00049    */
00050 
00051   template<typename, bool, bool>
00052     struct _BracketMatcher;
00053 
00054   /**
00055    * @brief Builds an NFA from an input iterator range.
00056    *
00057    * The %_TraitsT type should fulfill requirements [28.3].
00058    */
00059   template<typename _TraitsT>
00060     class _Compiler
00061     {
00062     public:
00063       typedef typename _TraitsT::char_type        _CharT;
00064       typedef const _CharT*                       _IterT;
00065       typedef _NFA<_TraitsT>                      _RegexT;
00066       typedef regex_constants::syntax_option_type _FlagT;
00067 
00068       _Compiler(_IterT __b, _IterT __e,
00069                 const typename _TraitsT::locale_type& __traits, _FlagT __flags);
00070 
00071       shared_ptr<const _RegexT>
00072       _M_get_nfa()
00073       { return std::move(_M_nfa); }
00074 
00075     private:
00076       typedef _Scanner<_CharT>               _ScannerT;
00077       typedef typename _TraitsT::string_type _StringT;
00078       typedef typename _ScannerT::_TokenT    _TokenT;
00079       typedef _StateSeq<_TraitsT>            _StateSeqT;
00080       typedef std::stack<_StateSeqT>         _StackT;
00081       typedef std::ctype<_CharT>             _CtypeT;
00082 
00083       // accepts a specific token or returns false.
00084       bool
00085       _M_match_token(_TokenT __token);
00086 
00087       void
00088       _M_disjunction();
00089 
00090       void
00091       _M_alternative();
00092 
00093       bool
00094       _M_term();
00095 
00096       bool
00097       _M_assertion();
00098 
00099       bool
00100       _M_quantifier();
00101 
00102       bool
00103       _M_atom();
00104 
00105       bool
00106       _M_bracket_expression();
00107 
00108       template<bool __icase, bool __collate>
00109         void
00110         _M_insert_any_matcher_ecma();
00111 
00112       template<bool __icase, bool __collate>
00113         void
00114         _M_insert_any_matcher_posix();
00115 
00116       template<bool __icase, bool __collate>
00117         void
00118         _M_insert_char_matcher();
00119 
00120       template<bool __icase, bool __collate>
00121         void
00122         _M_insert_character_class_matcher();
00123 
00124       template<bool __icase, bool __collate>
00125         void
00126         _M_insert_bracket_matcher(bool __neg);
00127 
00128       // Returns true if successfully matched one term and should continue.
00129       // Returns false if the compiler should move on.
00130       template<bool __icase, bool __collate>
00131         bool
00132         _M_expression_term(pair<bool, _CharT>& __last_char,
00133                            _BracketMatcher<_TraitsT, __icase, __collate>&
00134                            __matcher);
00135 
00136       int
00137       _M_cur_int_value(int __radix);
00138 
00139       bool
00140       _M_try_char();
00141 
00142       _StateSeqT
00143       _M_pop()
00144       {
00145         auto ret = _M_stack.top();
00146         _M_stack.pop();
00147         return ret;
00148       }
00149 
00150       _FlagT              _M_flags;
00151       _ScannerT           _M_scanner;
00152       shared_ptr<_RegexT> _M_nfa;
00153       _StringT            _M_value;
00154       _StackT             _M_stack;
00155       const _TraitsT&     _M_traits;
00156       const _CtypeT&      _M_ctype;
00157     };
00158 
00159   template<typename _Tp>
00160     struct __has_contiguous_iter : std::false_type { };
00161 
00162   template<typename _Ch, typename _Tr, typename _Alloc>
00163     struct __has_contiguous_iter<std::basic_string<_Ch, _Tr, _Alloc>>
00164     : std::true_type
00165     { };
00166 
00167   template<typename _Tp, typename _Alloc>
00168     struct __has_contiguous_iter<std::vector<_Tp, _Alloc>>
00169     : std::true_type
00170     { };
00171 
00172   template<typename _Tp>
00173     struct __is_contiguous_normal_iter : std::false_type { };
00174 
00175   template<typename _CharT>
00176     struct __is_contiguous_normal_iter<_CharT*> : std::true_type { };
00177 
00178   template<typename _Tp, typename _Cont>
00179     struct
00180     __is_contiguous_normal_iter<__gnu_cxx::__normal_iterator<_Tp, _Cont>>
00181     : __has_contiguous_iter<_Cont>::type
00182     { };
00183 
00184   template<typename _Iter, typename _TraitsT>
00185     using __enable_if_contiguous_normal_iter
00186       = typename enable_if< __is_contiguous_normal_iter<_Iter>::value,
00187                            std::shared_ptr<const _NFA<_TraitsT>> >::type;
00188 
00189   template<typename _Iter, typename _TraitsT>
00190     using __disable_if_contiguous_normal_iter
00191       = typename enable_if< !__is_contiguous_normal_iter<_Iter>::value,
00192                            std::shared_ptr<const _NFA<_TraitsT>> >::type;
00193 
00194   template<typename _FwdIter, typename _TraitsT>
00195     inline __enable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
00196     __compile_nfa(_FwdIter __first, _FwdIter __last,
00197                   const typename _TraitsT::locale_type& __loc,
00198                   regex_constants::syntax_option_type __flags)
00199     {
00200       size_t __len = __last - __first;
00201       const auto* __cfirst = __len ? std::__addressof(*__first) : nullptr;
00202       using _Cmplr = _Compiler<_TraitsT>;
00203       return _Cmplr(__cfirst, __cfirst + __len, __loc, __flags)._M_get_nfa();
00204     }
00205 
00206   template<typename _FwdIter, typename _TraitsT>
00207     inline __disable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
00208     __compile_nfa(_FwdIter __first, _FwdIter __last,
00209                   const typename _TraitsT::locale_type& __loc,
00210                   regex_constants::syntax_option_type __flags)
00211     {
00212       using char_type = typename _TraitsT::char_type;
00213       const basic_string<char_type> __str(__first, __last);
00214       return __compile_nfa<const char_type*, _TraitsT>(__str.data(),
00215           __str.data() + __str.size(), __loc, __flags);
00216     }
00217 
00218   // [28.13.14]
00219   template<typename _TraitsT, bool __icase, bool __collate>
00220     class _RegexTranslatorBase
00221     {
00222     public:
00223       typedef typename _TraitsT::char_type            _CharT;
00224       typedef typename _TraitsT::string_type          _StringT;
00225       typedef _StringT _StrTransT;
00226 
00227       explicit
00228       _RegexTranslatorBase(const _TraitsT& __traits)
00229       : _M_traits(__traits)
00230       { }
00231 
00232       _CharT
00233       _M_translate(_CharT __ch) const
00234       {
00235         if (__icase)
00236           return _M_traits.translate_nocase(__ch);
00237         else if (__collate)
00238           return _M_traits.translate(__ch);
00239         else
00240           return __ch;
00241       }
00242 
00243       _StrTransT
00244       _M_transform(_CharT __ch) const
00245       {
00246         _StrTransT __str(1, __ch);
00247         return _M_traits.transform(__str.begin(), __str.end());
00248       }
00249 
00250       // See LWG 523. It's not efficiently implementable when _TraitsT is not
00251       // std::regex_traits<>, and __collate is true. See specializations for
00252       // implementations of other cases.
00253       bool
00254       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
00255                      const _StrTransT& __s) const
00256       { return __first <= __s && __s <= __last; }
00257 
00258     protected:
00259       bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const
00260       {
00261         typedef std::ctype<_CharT> __ctype_type;
00262         const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc());
00263         auto __lower = __fctyp.tolower(__ch);
00264         auto __upper = __fctyp.toupper(__ch);
00265         return (__first <= __lower && __lower <= __last)
00266           || (__first <= __upper && __upper <= __last);
00267       }
00268 
00269       const _TraitsT& _M_traits;
00270     };
00271 
00272   template<typename _TraitsT, bool __icase, bool __collate>
00273     class _RegexTranslator
00274     : public _RegexTranslatorBase<_TraitsT, __icase, __collate>
00275     {
00276     public:
00277       typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base;
00278       using _Base::_Base;
00279     };
00280 
00281   template<typename _TraitsT, bool __icase>
00282     class _RegexTranslator<_TraitsT, __icase, false>
00283     : public _RegexTranslatorBase<_TraitsT, __icase, false>
00284     {
00285     public:
00286       typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base;
00287       typedef typename _Base::_CharT _CharT;
00288       typedef _CharT _StrTransT;
00289 
00290       using _Base::_Base;
00291 
00292       _StrTransT
00293       _M_transform(_CharT __ch) const
00294       { return __ch; }
00295 
00296       bool
00297       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
00298       {
00299         if (!__icase)
00300           return __first <= __ch && __ch <= __last;
00301         return this->_M_in_range_icase(__first, __last, __ch);
00302       }
00303     };
00304 
00305   template<typename _CharType>
00306     class _RegexTranslator<std::regex_traits<_CharType>, true, true>
00307     : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
00308     {
00309     public:
00310       typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
00311         _Base;
00312       typedef typename _Base::_CharT _CharT;
00313       typedef typename _Base::_StrTransT _StrTransT;
00314 
00315       using _Base::_Base;
00316 
00317       bool
00318       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
00319                      const _StrTransT& __str) const
00320       {
00321         __glibcxx_assert(__first.size() == 1);
00322         __glibcxx_assert(__last.size() == 1);
00323         __glibcxx_assert(__str.size() == 1);
00324         return this->_M_in_range_icase(__first[0], __last[0], __str[0]);
00325       }
00326     };
00327 
00328   template<typename _TraitsT>
00329     class _RegexTranslator<_TraitsT, false, false>
00330     {
00331     public:
00332       typedef typename _TraitsT::char_type _CharT;
00333       typedef _CharT                       _StrTransT;
00334 
00335       explicit
00336       _RegexTranslator(const _TraitsT&)
00337       { }
00338 
00339       _CharT
00340       _M_translate(_CharT __ch) const
00341       { return __ch; }
00342 
00343       _StrTransT
00344       _M_transform(_CharT __ch) const
00345       { return __ch; }
00346 
00347       bool
00348       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
00349       { return __first <= __ch && __ch <= __last; }
00350     };
00351 
00352   template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate>
00353     struct _AnyMatcher;
00354 
00355   template<typename _TraitsT, bool __icase, bool __collate>
00356     struct _AnyMatcher<_TraitsT, false, __icase, __collate>
00357     {
00358       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00359       typedef typename _TransT::_CharT                       _CharT;
00360 
00361       explicit
00362       _AnyMatcher(const _TraitsT& __traits)
00363       : _M_translator(__traits)
00364       { }
00365 
00366       bool
00367       operator()(_CharT __ch) const
00368       {
00369         static auto __nul = _M_translator._M_translate('\0');
00370         return _M_translator._M_translate(__ch) != __nul;
00371       }
00372 
00373       _TransT _M_translator;
00374     };
00375 
00376   template<typename _TraitsT, bool __icase, bool __collate>
00377     struct _AnyMatcher<_TraitsT, true, __icase, __collate>
00378     {
00379       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00380       typedef typename _TransT::_CharT                       _CharT;
00381 
00382       explicit
00383       _AnyMatcher(const _TraitsT& __traits)
00384       : _M_translator(__traits)
00385       { }
00386 
00387       bool
00388       operator()(_CharT __ch) const
00389       { return _M_apply(__ch, typename is_same<_CharT, char>::type()); }
00390 
00391       bool
00392       _M_apply(_CharT __ch, true_type) const
00393       {
00394         auto __c = _M_translator._M_translate(__ch);
00395         auto __n = _M_translator._M_translate('\n');
00396         auto __r = _M_translator._M_translate('\r');
00397         return __c != __n && __c != __r;
00398       }
00399 
00400       bool
00401       _M_apply(_CharT __ch, false_type) const
00402       {
00403         auto __c = _M_translator._M_translate(__ch);
00404         auto __n = _M_translator._M_translate('\n');
00405         auto __r = _M_translator._M_translate('\r');
00406         auto __u2028 = _M_translator._M_translate(u'\u2028');
00407         auto __u2029 = _M_translator._M_translate(u'\u2029');
00408         return __c != __n && __c != __r && __c != __u2028 && __c != __u2029;
00409       }
00410 
00411       _TransT _M_translator;
00412     };
00413 
00414   template<typename _TraitsT, bool __icase, bool __collate>
00415     struct _CharMatcher
00416     {
00417       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00418       typedef typename _TransT::_CharT                       _CharT;
00419 
00420       _CharMatcher(_CharT __ch, const _TraitsT& __traits)
00421       : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch))
00422       { }
00423 
00424       bool
00425       operator()(_CharT __ch) const
00426       { return _M_ch == _M_translator._M_translate(__ch); }
00427 
00428       _TransT _M_translator;
00429       _CharT  _M_ch;
00430     };
00431 
00432   /// Matches a character range (bracket expression)
00433   template<typename _TraitsT, bool __icase, bool __collate>
00434     struct _BracketMatcher
00435     {
00436     public:
00437       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00438       typedef typename _TransT::_CharT                       _CharT;
00439       typedef typename _TransT::_StrTransT                   _StrTransT;
00440       typedef typename _TraitsT::string_type                 _StringT;
00441       typedef typename _TraitsT::char_class_type             _CharClassT;
00442 
00443     public:
00444       _BracketMatcher(bool __is_non_matching,
00445                       const _TraitsT& __traits)
00446       : _M_class_set(0), _M_translator(__traits), _M_traits(__traits),
00447       _M_is_non_matching(__is_non_matching)
00448       { }
00449 
00450       bool
00451       operator()(_CharT __ch) const
00452       {
00453         _GLIBCXX_DEBUG_ASSERT(_M_is_ready);
00454         return _M_apply(__ch, _UseCache());
00455       }
00456 
00457       void
00458       _M_add_char(_CharT __c)
00459       {
00460         _M_char_set.push_back(_M_translator._M_translate(__c));
00461         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00462       }
00463 
00464       _StringT
00465       _M_add_collate_element(const _StringT& __s)
00466       {
00467         auto __st = _M_traits.lookup_collatename(__s.data(),
00468                                                  __s.data() + __s.size());
00469         if (__st.empty())
00470           __throw_regex_error(regex_constants::error_collate,
00471                               "Invalid collate element.");
00472         _M_char_set.push_back(_M_translator._M_translate(__st[0]));
00473         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00474         return __st;
00475       }
00476 
00477       void
00478       _M_add_equivalence_class(const _StringT& __s)
00479       {
00480         auto __st = _M_traits.lookup_collatename(__s.data(),
00481                                                  __s.data() + __s.size());
00482         if (__st.empty())
00483           __throw_regex_error(regex_constants::error_collate,
00484                               "Invalid equivalence class.");
00485         __st = _M_traits.transform_primary(__st.data(),
00486                                            __st.data() + __st.size());
00487         _M_equiv_set.push_back(__st);
00488         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00489       }
00490 
00491       // __neg should be true for \D, \S and \W only.
00492       void
00493       _M_add_character_class(const _StringT& __s, bool __neg)
00494       {
00495         auto __mask = _M_traits.lookup_classname(__s.data(),
00496                                                  __s.data() + __s.size(),
00497                                                  __icase);
00498         if (__mask == 0)
00499           __throw_regex_error(regex_constants::error_collate,
00500                               "Invalid character class.");
00501         if (!__neg)
00502           _M_class_set |= __mask;
00503         else
00504           _M_neg_class_set.push_back(__mask);
00505         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00506       }
00507 
00508       void
00509       _M_make_range(_CharT __l, _CharT __r)
00510       {
00511         if (__l > __r)
00512           __throw_regex_error(regex_constants::error_range,
00513                               "Invalid range in bracket expression.");
00514         _M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
00515                                          _M_translator._M_transform(__r)));
00516         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00517       }
00518 
00519       void
00520       _M_ready()
00521       {
00522         std::sort(_M_char_set.begin(), _M_char_set.end());
00523         auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
00524         _M_char_set.erase(__end, _M_char_set.end());
00525         _M_make_cache(_UseCache());
00526         _GLIBCXX_DEBUG_ONLY(_M_is_ready = true);
00527       }
00528 
00529     private:
00530       // Currently we only use the cache for char
00531       typedef typename std::is_same<_CharT, char>::type _UseCache;
00532 
00533       static constexpr size_t
00534       _S_cache_size()
00535       {
00536         return 1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value));
00537       }
00538 
00539       struct _Dummy { };
00540       typedef typename std::conditional<_UseCache::value,
00541                                         std::bitset<_S_cache_size()>,
00542                                         _Dummy>::type _CacheT;
00543       typedef typename std::make_unsigned<_CharT>::type _UnsignedCharT;
00544 
00545       bool
00546       _M_apply(_CharT __ch, false_type) const;
00547 
00548       bool
00549       _M_apply(_CharT __ch, true_type) const
00550       { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; }
00551 
00552       void
00553       _M_make_cache(true_type)
00554       {
00555         for (unsigned __i = 0; __i < _M_cache.size(); __i++)
00556           _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type());
00557       }
00558 
00559       void
00560       _M_make_cache(false_type)
00561       { }
00562 
00563     private:
00564       std::vector<_CharT>                       _M_char_set;
00565       std::vector<_StringT>                     _M_equiv_set;
00566       std::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
00567       std::vector<_CharClassT>                  _M_neg_class_set;
00568       _CharClassT                               _M_class_set;
00569       _TransT                                   _M_translator;
00570       const _TraitsT&                           _M_traits;
00571       bool                                      _M_is_non_matching;
00572       _CacheT                                   _M_cache;
00573 #ifdef _GLIBCXX_DEBUG
00574       bool                                      _M_is_ready = false;
00575 #endif
00576     };
00577 
00578  //@} regex-detail
00579 _GLIBCXX_END_NAMESPACE_VERSION
00580 } // namespace __detail
00581 } // namespace std
00582 
00583 #include <bits/regex_compiler.tcc>