@@ -382,7 +382,10 @@ public:
382382
383383 template <class _FwdIt>
384384 string_type lookup_collatename(_FwdIt _First, _FwdIt _Last) const { // map [_First, _Last) to collation element
385- return string_type{_First, _Last};
385+ if (_First != _Last && _STD next(_First) == _Last) { // the name is exactly one character long
386+ return string_type{_First, _Last}; // that single character is returned as the collating element
387+ }
388+ return string_type{}; // empty and multi-character names yield an empty string
386389 }
387390
388391 locale_type imbue(locale_type _Lx) { // store locale object
@@ -1507,8 +1510,6 @@ public:
15071510template <class _FwdIt, class _Elem, class _RxTraits>
15081511class _Builder { // provides operations used by _Parser to build the nfa
15091512public:
1510- using _Difft = typename iterator_traits<_FwdIt>::difference_type;
1511-
15121513 _Builder(const _RxTraits& _Tr, regex_constants::syntax_option_type);
15131514 void _Setlong();
15141515 // _Discard_pattern is an ABI zombie name
@@ -1525,8 +1526,8 @@ public:
15251526 void _Add_char_to_class(_Elem _Ch);
15261527 void _Add_range2(_Elem, _Elem);
15271528 void _Add_named_class(typename _RxTraits::char_class_type, bool);
1528- void _Add_equiv(_FwdIt, _FwdIt, _Difft );
1529- void _Add_coll(_FwdIt, _FwdIt, _Difft );
1529+ void _Add_equiv2(const _Elem*, const _Elem* );
1530+ void _Add_coll2(const _Elem*, const _Elem* );
15301531 _Node_base* _Begin_group();
15311532 void _End_group(_Node_base* _Back);
15321533 _Node_base* _Begin_assert_group(bool);
@@ -1547,7 +1548,7 @@ private:
15471548 void _Add_char_to_bitmap(_Elem _Ch);
15481549 void _Add_char_to_array(_Elem _Ch);
15491550 void _Add_elts(_Node_class<_Elem, _RxTraits>*, typename _RxTraits::char_class_type, bool);
1550- void _Char_to_elts(_FwdIt, _FwdIt, _Difft , _Sequence<_Elem>**);
1551+ void _Char_to_elts2(const _Elem*, const _Elem* , _Sequence<_Elem>**);
15511552
15521553 _Root_node* _Root;
15531554 _Node_base* _Current;
@@ -1733,7 +1734,7 @@ private:
17331734 bool _DecimalDigits3(regex_constants::error_type _Error_type, int _Initial = 0);
17341735 void _HexDigits(int);
17351736 bool _OctalDigits();
1736- void _Do_ex_class (_Meta_type);
1737+ _Prs_ret _Do_ex_class2 (_Meta_type);
17371738 bool _CharacterClassEscape(bool);
17381739 _Prs_ret _ClassEscape3();
17391740 _Prs_ret _ClassAtom(bool);
@@ -1752,6 +1753,7 @@ private:
17521753 void _Quantifier();
17531754 bool _Alternative();
17541755 void _Disjunction();
1756+ void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep);
17551757
17561758 _FwdIt _Pat;
17571759 _FwdIt _Begin;
@@ -2952,16 +2954,17 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_named_class(typename _RxTraits::ch
29522954}
29532955
29542956template <class _FwdIt, class _Elem, class _RxTraits>
2955- void _Builder<_FwdIt, _Elem, _RxTraits>::_Char_to_elts(_FwdIt _First, _FwdIt _Last, _Difft _Diff ,
2957+ void _Builder<_FwdIt, _Elem, _RxTraits>::_Char_to_elts2(const _Elem* const _First, const _Elem* const _Last ,
29562958 _Sequence<_Elem>** _Cur) { // add collation element to element sequence
2957- while (*_Cur && static_cast<unsigned int>(_Diff) < (*_Cur)->_Sz) {
2959+ auto _Diff = static_cast<unsigned int>(_Last - _First);
2960+ while (*_Cur && _Diff < (*_Cur)->_Sz) {
29582961 _Cur = &(*_Cur)->_Next;
29592962 }
29602963
2961- if (!(*_Cur) || static_cast<unsigned int>( _Diff) != (*_Cur)->_Sz) {
2964+ if (!(*_Cur) || _Diff != (*_Cur)->_Sz) {
29622965 // add new sequence holding elements of the same length
29632966 _Sequence<_Elem>* _Node = *_Cur;
2964- *_Cur = new _Sequence<_Elem>(static_cast<unsigned int>( _Diff) );
2967+ *_Cur = new _Sequence<_Elem>(_Diff);
29652968 (*_Cur)->_Next = _Node;
29662969 }
29672970 (*_Cur)->_Data._Insert2(_First, _Last);
@@ -2978,10 +2981,15 @@ unsigned int _Builder<_FwdIt, _Elem, _RxTraits>::_Get_tmax() const {
29782981}
29792982
29802983template <class _FwdIt, class _Elem, class _RxTraits>
2981- void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv(_FwdIt _First, _FwdIt _Last, _Difft _Diff ) {
2984+ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv2(const _Elem* const _First, const _Elem* const _Last ) {
29822985 // add elements of equivalence class to bracket expression
29832986 _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
29842987 typename _RxTraits::string_type _Str = _Traits.transform_primary(_First, _Last);
2988+
2989+ if (_Str.empty()) {
2990+ _Xregex_error(regex_constants::error_collate);
2991+ }
2992+
29852993 for (unsigned int _Ch = 0; _Ch < _Bmp_max; ++_Ch) { // add elements
29862994 _Elem _Ex = static_cast<_Elem>(_Ch);
29872995 if (_Traits.transform_primary(_STD addressof(_Ex), _STD addressof(_Ex) + 1)
@@ -2995,16 +3003,16 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv(_FwdIt _First, _FwdIt _Last,
29953003 }
29963004 if (_Bmp_max < static_cast<unsigned int>(_STD _Max_limit<_Elem>())) { // map range
29973005 _Sequence<_Elem>** _Cur = _STD addressof(_Node->_Equiv);
2998- _Char_to_elts (_First, _Last, _Diff , _Cur);
3006+ _Char_to_elts2 (_First, _Last, _Cur);
29993007 }
30003008}
30013009
30023010template <class _FwdIt, class _Elem, class _RxTraits>
3003- void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_coll(_FwdIt _First, _FwdIt _Last, _Difft _Diff ) {
3011+ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_coll2(const _Elem* const _First, const _Elem* const _Last ) { // [_First, _Last) holds the characters of one collating element
30043012 // add collation element to bracket expression
30053013 _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
30063014 _Sequence<_Elem>** _Cur = _STD addressof(_Node->_Coll);
3007- _Char_to_elts (_First, _Last, _Diff , _Cur);
3015+ _Char_to_elts2 (_First, _Last, _Cur); // no explicit length argument: the element length follows from the pointer range
30083016}
30093017
30103018template <class _FwdIt, class _Elem, class _RxTraits>
@@ -3399,11 +3407,11 @@ bool _Lookup_collating_range(const _Elem _Ch, const _Buf<_Elem>* const _Bufptr,
33993407}
34003408
34013409template <class _Elem, class _RxTraits>
3402- bool _Lookup_equiv(typename _RxTraits::_Uelem _Ch, const _Sequence<_Elem>* _Eq, const _RxTraits& _Traits) {
3410+ bool _Lookup_equiv2(_Elem _Ch, const _Sequence<_Elem>* _Eq, const _RxTraits& _Traits) {
34033411 // check whether _Ch is in _Eq
34043412 typename _RxTraits::string_type _Str0;
34053413 typename _RxTraits::string_type _Str1;
3406- _Str1.push_back(static_cast<_Elem>( _Ch) );
3414+ _Str1.push_back(_Ch);
34073415 _Str1 = _Traits.transform_primary(_Str1.begin(), _Str1.end());
34083416 while (_Eq) { // look for sequence of elements that are the right size
34093417 for (unsigned int _Ix = 0; _Ix < _Eq->_Data._Size(); _Ix += _Eq->_Sz) { // look for _Ch
@@ -3418,22 +3426,48 @@ bool _Lookup_equiv(typename _RxTraits::_Uelem _Ch, const _Sequence<_Elem>* _Eq,
34183426 return false;
34193427}
34203428
3421- template <class _BidIt, class _Elem>
3422- _BidIt _Lookup_coll(_BidIt _First, _BidIt _Last, const _Sequence<_Elem>* _Eq) {
3423- // look for collation element [_First, _Last) in _Eq
3424- while (_Eq) { // look for sequence of elements that are the right size
3425- for (unsigned int _Ix = 0; _Ix < _Eq->_Data._Size(); _Ix += _Eq->_Sz) { // look for character range
3426- _BidIt _Res = _First;
3427- for (size_t _Jx = 0; _Jx < _Eq->_Sz; ++_Jx) { // check current character
3428- if (*_Res++ != *(_Eq->_Data._Str() + _Ix + _Jx)) {
3429- break;
3430- }
3429+ template <class _BidIt, class _Elem, class _RxTraits>
3430+ _BidIt _Lookup_coll2(_Elem _First_ch, _BidIt _First, const _BidIt _Last, const _Sequence<_Elem>* _Seq,
3431+ const _RxTraits& _Traits, const regex_constants::syntax_option_type _Flags) {
3432+ // look for a collation element from _Seq at the start of [_First, _Last); returns the position just past the match, or _First if nothing matches
3433+ typename _RxTraits::string_type _Str;
3434+
3435+ // extend translated input character sequence
3436+ if (_Seq) { // the longest collating elements come first
3437+ _Str.push_back(_First_ch); // stored as given; only the characters that follow are translated here
3438+ const auto _Coll_size = _Seq->_Sz; // length of the first (longest) group, so no more input than this is collected
3439+ size_t _Str_size = 1;
3440+ _BidIt _Pos = _First;
3441+ ++_Pos; // NOTE(review): assumes _First != _Last and that _First_ch stands for *_First - confirm at the call sites
3442+
3443+ for (; _Str_size < _Coll_size && _Pos != _Last; ++_Pos) {
3444+ _Elem _Ch = *_Pos;
3445+ if (_Flags & regex_constants::icase) { // icase takes precedence over collate
3446+ _Ch = _Traits.translate_nocase(_Ch);
3447+ } else if (_Flags & regex_constants::collate) {
3448+ _Ch = _Traits.translate(_Ch);
34313449 }
3432- if (_Res == _Last) {
3433- return _Last;
3450+ _Str.push_back(_Ch);
3451+ ++_Str_size;
3452+ }
3453+ }
3454+
3455+ while (_Seq) { // look for sequence of elements that are the right size
3456+ const auto _Size = _Seq->_Sz;
3457+
3458+ // match input character sequence to stored collating elements
3459+ if (_Str.size() >= _Size) { // groups whose elements are longer than the collected input cannot match
3460+ const _Elem* const _Str_first = _Str.data();
3461+ const _Elem* const _Str_last = _Str_first + _Size;
3462+ const _Elem* _Current = _Seq->_Data._Str();
3463+ for (auto _Remaining = _Seq->_Data._Size(); _Remaining >= _Size; _Current += _Size, _Remaining -= _Size) { // step through the stored data _Size characters at a time
3464+ if (_STD equal(_Str_first, _Str_last, _Current)) {
3465+ _STD advance(_First, static_cast<_Iter_diff_t<_BidIt>>(_Size)); // move past the matched element
3466+ return _First;
3467+ }
34343468 }
34353469 }
3436- _Eq = _Eq ->_Next;
3470+ _Seq = _Seq ->_Next; // continue with the next group in the list
34373471 }
34383472 return _First;
34393473}
@@ -3454,7 +3488,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
34543488 _It _Resx;
34553489 _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx);
34563490 if (_Node->_Coll
3457- && (_Resx = _STD _Lookup_coll( _Tgt_state._Cur, _End, _Node->_Coll))
3491+ && (_Resx = _STD _Lookup_coll2(_Ch, _Tgt_state._Cur, _End, _Node->_Coll, _Traits, _Sflags ))
34583492 != _Tgt_state._Cur) { // check for collation element
34593493 _Res0 = _Resx;
34603494 _Found = true;
@@ -3470,7 +3504,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
34703504 _Found = true;
34713505 } else if (_Node->_Classes != typename _RxTraits::char_class_type{} && _Traits.isctype(_Ch, _Node->_Classes)) {
34723506 _Found = true;
3473- } else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh , _Node->_Equiv, _Traits)) {
3507+ } else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch , _Node->_Equiv, _Traits)) {
34743508 _Found = true;
34753509 } else {
34763510 _Found = false;
@@ -3811,10 +3845,9 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
38113845 const auto _UCh = static_cast<typename _RxTraits::_Uelem>(_Ch);
38123846
38133847 _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx);
3814- _It _Next = _First_arg;
3815- ++_Next;
38163848
3817- if (_Node->_Coll && _STD _Lookup_coll(_First_arg, _Next, _Node->_Coll) != _First_arg) {
3849+ if (_Node->_Coll
3850+ && _STD _Lookup_coll2(_Ch, _First_arg, _Last, _Node->_Coll, _Traits, _Sflags) != _First_arg) {
38183851 _Found = true;
38193852 } else if (_Node->_Ranges
38203853 && (_Sflags & regex_constants::collate
@@ -3830,7 +3863,7 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
38303863 } else if (_Node->_Classes != typename _RxTraits::char_class_type{}
38313864 && _Traits.isctype(_Ch, _Node->_Classes)) {
38323865 _Found = true;
3833- } else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh , _Node->_Equiv, _Traits)) {
3866+ } else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch , _Node->_Equiv, _Traits)) {
38343867 _Found = true;
38353868 } else {
38363869 _Found = false;
@@ -4074,45 +4107,68 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_OctalDigits() { // check for up to 3 oc
40744107}
40754108
40764109template <class _FwdIt, class _Elem, class _RxTraits>
4077- void _Parser<_FwdIt, _Elem, _RxTraits>::_Do_ex_class (
4110+ _Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_Do_ex_class2 (
40784111 _Meta_type _End_arg) { // handle delimited expressions within bracket expression
4079- regex_constants::error_type _Errtype = (_End_arg == _Meta_colon ? regex_constants::error_ctype
4080- : _End_arg == _Meta_equal ? regex_constants::error_collate
4081- : _End_arg == _Meta_dot ? regex_constants::error_collate
4082- : regex_constants::error_syntax);
4083- _FwdIt _Beg = _Pat;
4084- _Iter_diff_t<_FwdIt> _Diff = 0;
4112+ const regex_constants::error_type _Errtype =
4113+ _End_arg == _Meta_colon ? regex_constants::error_ctype : regex_constants::error_collate; // every delimiter other than _Meta_colon reports error_collate
4114+ const _FwdIt _Beg = _Pat; // start of the delimited name
40854115
40864116 while (_Mchar != _Meta_colon && _Mchar != _Meta_equal && _Mchar != _Meta_dot && _Mchar != _Meta_eos) {
40874117 // advance to end delimiter
40884118 _Next();
4089- ++_Diff;
40904119 }
4091- if (_Mchar != _End_arg) {
4092- _Error(_Errtype);
4093- } else if (_End_arg == _Meta_colon) { // handle named character class
4120+
4121+ const _FwdIt _End = _Pat; // end of the name [_Beg, _End), captured before the delimiter is checked
4122+ _Expect(_End_arg, _Errtype); // the closing delimiter must be of the kind passed in _End_arg (presumably _Expect also advances - confirm)
4123+ _Expect(_Meta_rsq, _Errtype); // ... and it must be followed by the _Meta_rsq token
4124+
4125+ if (_End_arg == _Meta_colon) { // handle named character class
40944126 typename _RxTraits::char_class_type _Cls =
4095- _Traits.lookup_classname(_Beg, _Pat , (_Flags & regex_constants::icase) != 0);
4127+ _Traits.lookup_classname(_Beg, _End , (_Flags & regex_constants::icase) != 0);
40964128 if (!_Cls) {
40974129 _Error(regex_constants::error_ctype);
40984130 }
40994131
41004132 _Nfa._Add_named_class(_Cls, false);
4101- } else if (_End_arg == _Meta_equal) { // process =
4102- if (_Beg == _Pat) {
4133+ return _Prs_set;
4134+ } else {
4135+ typename _RxTraits::string_type _Coll_elem = _Traits.lookup_collatename(_Beg, _End); // resolve the name through the traits; an empty result is rejected below
4136+ const auto _Size = _Coll_elem.size();
4137+
4138+ if (_Size == 0) {
41034139 _Error(regex_constants::error_collate);
4104- } else {
4105- _Nfa._Add_equiv(_Beg, _Pat, _Diff);
41064140 }
4107- } else if (_End_arg == _Meta_dot) { // process .
4108- if (_Beg == _Pat) {
4109- _Error(regex_constants::error_collate);
4110- } else {
4111- _Nfa._Add_coll(_Beg, _Pat, _Diff);
4141+
4142+ if (_Size > _Max_limit<unsigned int>()) { // reject lengths that do not fit in unsigned int
4143+ _Error(regex_constants::error_space);
4144+ }
4145+
4146+ _Elem* const _Coll_elem_first = &_Coll_elem.front(); // non-const pointer: the characters may be translated in place below
4147+ const _Elem* const _Coll_elem_last = _Coll_elem_first + _Size;
4148+ if (_End_arg == _Meta_equal) { // process equivalence
4149+ _Nfa._Add_equiv2(_Coll_elem_first, _Coll_elem_last);
4150+ return _Prs_set;
4151+ } else { // process collating element
4152+ if (_Size == 1) {
4153+ _Val = *_Coll_elem_first; // a one-character collating element is handed back through _Val
4154+ return _Prs_chr;
4155+ }
4156+
4157+ // Character ranges with multi-character bounds cannot be represented in NFA nodes yet (see GH-5391).
4158+ // Provisionally treat multi-character collating elements as character sets.
4159+ if (_Flags & regex_constants::icase) { // icase takes precedence over collate
4160+ for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
4161+ *_Current = _Traits.translate_nocase(*_Current);
4162+ }
4163+ } else if (_Flags & regex_constants::collate) {
4164+ for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
4165+ *_Current = _Traits.translate(*_Current);
4166+ }
4167+ }
4168+ _Nfa._Add_coll2(_Coll_elem_first, _Coll_elem_last);
4169+ return _Prs_set;
41124170 }
41134171 }
4114- _Next();
4115- _Expect(_Meta_rsq, _Errtype);
41164172}
41174173
41184174template <class _FwdIt, class _Elem, class _RxTraits>
@@ -4172,8 +4228,7 @@ _Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_ClassAtom(const bool _Initial) { //
41724228 if (_Mchar == _Meta_colon || _Mchar == _Meta_equal || _Mchar == _Meta_dot) { // handle delimited expression
41734229 _Meta_type _St = _Mchar;
41744230 _Next();
4175- _Do_ex_class(_St);
4176- return _Prs_set;
4231+ return _Do_ex_class2(_St);
41774232 } else { // handle ordinary [
41784233 _Val = _Meta_lsq;
41794234 return _Prs_chr;
@@ -4621,7 +4676,9 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Disjunction() { // check for valid disj
46214676 }
46224677}
46234678
4624- inline void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep) {
4679+ template <class _FwdIt, class _Elem, class _RxTraits>
4680+ void _Parser<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
4681+ _Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep) {
46254682 // walks regex NFA, calculates values of _Node_rep::_Simple_loop
46264683 for (; _Nx != _Ne && _Nx; _Nx = _Nx->_Next) {
46274684 switch (_Nx->_Kind) {
@@ -4662,14 +4719,26 @@ inline void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_r
46624719 _Outer_rep = nullptr;
46634720 }
46644721 break;
4722+ case _N_class:
4723+ if (_Outer_rep) {
4724+ // _Node_rep is not simple if a class can match character sequences of different lengths
4725+ auto _Node = static_cast<const _Node_class<_Elem, _RxTraits>*>(_Nx);
4726+ bool _Coll_diff_size =
4727+ _Node->_Coll
4728+ && (_Node->_Small || _Node->_Large || _Node->_Ranges || _Node->_Classes || _Node->_Coll->_Next);
4729+ if (_Coll_diff_size || _Node->_Equiv
4730+ || ((_Flags & regex_constants::collate) && (_Node->_Ranges || (_Node->_Flags & _Fl_negate)))) {
4731+ _Outer_rep->_Simple_loop = 0;
4732+ }
4733+ }
4734+ break;
46654735 case _N_none:
46664736 case _N_nop:
46674737 case _N_bol:
46684738 case _N_eol:
46694739 case _N_wbound:
46704740 case _N_dot:
46714741 case _N_str:
4672- case _N_class:
46734742 case _N_group:
46744743 case _N_end_group:
46754744 case _N_end_assert:
0 commit comments