Skip to content

Commit 81056a9

Browse files
<regex>: Properly parse and match collating symbols and equivalences (#5392)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent b0bd6a7 commit 81056a9

File tree

5 files changed: 369 additions and 65 deletions.

stl/inc/regex

Lines changed: 132 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -382,7 +382,10 @@ public:
382382

383383
template <class _FwdIt>
384384
string_type lookup_collatename(_FwdIt _First, _FwdIt _Last) const { // map [_First, _Last) to collation element
385-
return string_type{_First, _Last};
385+
if (_First != _Last && _STD next(_First) == _Last) {
386+
return string_type{_First, _Last};
387+
}
388+
return string_type{};
386389
}
387390

388391
locale_type imbue(locale_type _Lx) { // store locale object
@@ -1507,8 +1510,6 @@ public:
15071510
template <class _FwdIt, class _Elem, class _RxTraits>
15081511
class _Builder { // provides operations used by _Parser to build the nfa
15091512
public:
1510-
using _Difft = typename iterator_traits<_FwdIt>::difference_type;
1511-
15121513
_Builder(const _RxTraits& _Tr, regex_constants::syntax_option_type);
15131514
void _Setlong();
15141515
// _Discard_pattern is an ABI zombie name
@@ -1525,8 +1526,8 @@ public:
15251526
void _Add_char_to_class(_Elem _Ch);
15261527
void _Add_range2(_Elem, _Elem);
15271528
void _Add_named_class(typename _RxTraits::char_class_type, bool);
1528-
void _Add_equiv(_FwdIt, _FwdIt, _Difft);
1529-
void _Add_coll(_FwdIt, _FwdIt, _Difft);
1529+
void _Add_equiv2(const _Elem*, const _Elem*);
1530+
void _Add_coll2(const _Elem*, const _Elem*);
15301531
_Node_base* _Begin_group();
15311532
void _End_group(_Node_base* _Back);
15321533
_Node_base* _Begin_assert_group(bool);
@@ -1547,7 +1548,7 @@ private:
15471548
void _Add_char_to_bitmap(_Elem _Ch);
15481549
void _Add_char_to_array(_Elem _Ch);
15491550
void _Add_elts(_Node_class<_Elem, _RxTraits>*, typename _RxTraits::char_class_type, bool);
1550-
void _Char_to_elts(_FwdIt, _FwdIt, _Difft, _Sequence<_Elem>**);
1551+
void _Char_to_elts2(const _Elem*, const _Elem*, _Sequence<_Elem>**);
15511552

15521553
_Root_node* _Root;
15531554
_Node_base* _Current;
@@ -1733,7 +1734,7 @@ private:
17331734
bool _DecimalDigits3(regex_constants::error_type _Error_type, int _Initial = 0);
17341735
void _HexDigits(int);
17351736
bool _OctalDigits();
1736-
void _Do_ex_class(_Meta_type);
1737+
_Prs_ret _Do_ex_class2(_Meta_type);
17371738
bool _CharacterClassEscape(bool);
17381739
_Prs_ret _ClassEscape3();
17391740
_Prs_ret _ClassAtom(bool);
@@ -1752,6 +1753,7 @@ private:
17521753
void _Quantifier();
17531754
bool _Alternative();
17541755
void _Disjunction();
1756+
void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep);
17551757

17561758
_FwdIt _Pat;
17571759
_FwdIt _Begin;
@@ -2952,16 +2954,17 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_named_class(typename _RxTraits::ch
29522954
}
29532955

29542956
template <class _FwdIt, class _Elem, class _RxTraits>
2955-
void _Builder<_FwdIt, _Elem, _RxTraits>::_Char_to_elts(_FwdIt _First, _FwdIt _Last, _Difft _Diff,
2957+
void _Builder<_FwdIt, _Elem, _RxTraits>::_Char_to_elts2(const _Elem* const _First, const _Elem* const _Last,
29562958
_Sequence<_Elem>** _Cur) { // add collation element to element sequence
2957-
while (*_Cur && static_cast<unsigned int>(_Diff) < (*_Cur)->_Sz) {
2959+
auto _Diff = static_cast<unsigned int>(_Last - _First);
2960+
while (*_Cur && _Diff < (*_Cur)->_Sz) {
29582961
_Cur = &(*_Cur)->_Next;
29592962
}
29602963

2961-
if (!(*_Cur) || static_cast<unsigned int>(_Diff) != (*_Cur)->_Sz) {
2964+
if (!(*_Cur) || _Diff != (*_Cur)->_Sz) {
29622965
// add new sequence holding elements of the same length
29632966
_Sequence<_Elem>* _Node = *_Cur;
2964-
*_Cur = new _Sequence<_Elem>(static_cast<unsigned int>(_Diff));
2967+
*_Cur = new _Sequence<_Elem>(_Diff);
29652968
(*_Cur)->_Next = _Node;
29662969
}
29672970
(*_Cur)->_Data._Insert2(_First, _Last);
@@ -2978,10 +2981,15 @@ unsigned int _Builder<_FwdIt, _Elem, _RxTraits>::_Get_tmax() const {
29782981
}
29792982

29802983
template <class _FwdIt, class _Elem, class _RxTraits>
2981-
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv(_FwdIt _First, _FwdIt _Last, _Difft _Diff) {
2984+
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv2(const _Elem* const _First, const _Elem* const _Last) {
29822985
// add elements of equivalence class to bracket expression
29832986
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
29842987
typename _RxTraits::string_type _Str = _Traits.transform_primary(_First, _Last);
2988+
2989+
if (_Str.empty()) {
2990+
_Xregex_error(regex_constants::error_collate);
2991+
}
2992+
29852993
for (unsigned int _Ch = 0; _Ch < _Bmp_max; ++_Ch) { // add elements
29862994
_Elem _Ex = static_cast<_Elem>(_Ch);
29872995
if (_Traits.transform_primary(_STD addressof(_Ex), _STD addressof(_Ex) + 1)
@@ -2995,16 +3003,16 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv(_FwdIt _First, _FwdIt _Last,
29953003
}
29963004
if (_Bmp_max < static_cast<unsigned int>(_STD _Max_limit<_Elem>())) { // map range
29973005
_Sequence<_Elem>** _Cur = _STD addressof(_Node->_Equiv);
2998-
_Char_to_elts(_First, _Last, _Diff, _Cur);
3006+
_Char_to_elts2(_First, _Last, _Cur);
29993007
}
30003008
}
30013009

30023010
template <class _FwdIt, class _Elem, class _RxTraits>
3003-
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_coll(_FwdIt _First, _FwdIt _Last, _Difft _Diff) {
3011+
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_coll2(const _Elem* const _First, const _Elem* const _Last) {
30043012
// add collation element to bracket expression
30053013
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
30063014
_Sequence<_Elem>** _Cur = _STD addressof(_Node->_Coll);
3007-
_Char_to_elts(_First, _Last, _Diff, _Cur);
3015+
_Char_to_elts2(_First, _Last, _Cur);
30083016
}
30093017

30103018
template <class _FwdIt, class _Elem, class _RxTraits>
@@ -3399,11 +3407,11 @@ bool _Lookup_collating_range(const _Elem _Ch, const _Buf<_Elem>* const _Bufptr,
33993407
}
34003408

34013409
template <class _Elem, class _RxTraits>
3402-
bool _Lookup_equiv(typename _RxTraits::_Uelem _Ch, const _Sequence<_Elem>* _Eq, const _RxTraits& _Traits) {
3410+
bool _Lookup_equiv2(_Elem _Ch, const _Sequence<_Elem>* _Eq, const _RxTraits& _Traits) {
34033411
// check whether _Ch is in _Eq
34043412
typename _RxTraits::string_type _Str0;
34053413
typename _RxTraits::string_type _Str1;
3406-
_Str1.push_back(static_cast<_Elem>(_Ch));
3414+
_Str1.push_back(_Ch);
34073415
_Str1 = _Traits.transform_primary(_Str1.begin(), _Str1.end());
34083416
while (_Eq) { // look for sequence of elements that are the right size
34093417
for (unsigned int _Ix = 0; _Ix < _Eq->_Data._Size(); _Ix += _Eq->_Sz) { // look for _Ch
@@ -3418,22 +3426,48 @@ bool _Lookup_equiv(typename _RxTraits::_Uelem _Ch, const _Sequence<_Elem>* _Eq,
34183426
return false;
34193427
}
34203428

3421-
template <class _BidIt, class _Elem>
3422-
_BidIt _Lookup_coll(_BidIt _First, _BidIt _Last, const _Sequence<_Elem>* _Eq) {
3423-
// look for collation element [_First, _Last) in _Eq
3424-
while (_Eq) { // look for sequence of elements that are the right size
3425-
for (unsigned int _Ix = 0; _Ix < _Eq->_Data._Size(); _Ix += _Eq->_Sz) { // look for character range
3426-
_BidIt _Res = _First;
3427-
for (size_t _Jx = 0; _Jx < _Eq->_Sz; ++_Jx) { // check current character
3428-
if (*_Res++ != *(_Eq->_Data._Str() + _Ix + _Jx)) {
3429-
break;
3430-
}
3429+
template <class _BidIt, class _Elem, class _RxTraits>
3430+
_BidIt _Lookup_coll2(_Elem _First_ch, _BidIt _First, const _BidIt _Last, const _Sequence<_Elem>* _Seq,
3431+
const _RxTraits& _Traits, const regex_constants::syntax_option_type _Flags) {
3432+
// look for collation element [_First, _Last) in _Seq
3433+
typename _RxTraits::string_type _Str;
3434+
3435+
// extend translated input character sequence
3436+
if (_Seq) { // the longest collating elements come first
3437+
_Str.push_back(_First_ch);
3438+
const auto _Coll_size = _Seq->_Sz;
3439+
size_t _Str_size = 1;
3440+
_BidIt _Pos = _First;
3441+
++_Pos;
3442+
3443+
for (; _Str_size < _Coll_size && _Pos != _Last; ++_Pos) {
3444+
_Elem _Ch = *_Pos;
3445+
if (_Flags & regex_constants::icase) {
3446+
_Ch = _Traits.translate_nocase(_Ch);
3447+
} else if (_Flags & regex_constants::collate) {
3448+
_Ch = _Traits.translate(_Ch);
34313449
}
3432-
if (_Res == _Last) {
3433-
return _Last;
3450+
_Str.push_back(_Ch);
3451+
++_Str_size;
3452+
}
3453+
}
3454+
3455+
while (_Seq) { // look for sequence of elements that are the right size
3456+
const auto _Size = _Seq->_Sz;
3457+
3458+
// match input character sequence to stored collating elements
3459+
if (_Str.size() >= _Size) {
3460+
const _Elem* const _Str_first = _Str.data();
3461+
const _Elem* const _Str_last = _Str_first + _Size;
3462+
const _Elem* _Current = _Seq->_Data._Str();
3463+
for (auto _Remaining = _Seq->_Data._Size(); _Remaining >= _Size; _Current += _Size, _Remaining -= _Size) {
3464+
if (_STD equal(_Str_first, _Str_last, _Current)) {
3465+
_STD advance(_First, static_cast<_Iter_diff_t<_BidIt>>(_Size));
3466+
return _First;
3467+
}
34343468
}
34353469
}
3436-
_Eq = _Eq->_Next;
3470+
_Seq = _Seq->_Next;
34373471
}
34383472
return _First;
34393473
}
@@ -3454,7 +3488,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
34543488
_It _Resx;
34553489
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx);
34563490
if (_Node->_Coll
3457-
&& (_Resx = _STD _Lookup_coll(_Tgt_state._Cur, _End, _Node->_Coll))
3491+
&& (_Resx = _STD _Lookup_coll2(_Ch, _Tgt_state._Cur, _End, _Node->_Coll, _Traits, _Sflags))
34583492
!= _Tgt_state._Cur) { // check for collation element
34593493
_Res0 = _Resx;
34603494
_Found = true;
@@ -3470,7 +3504,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
34703504
_Found = true;
34713505
} else if (_Node->_Classes != typename _RxTraits::char_class_type{} && _Traits.isctype(_Ch, _Node->_Classes)) {
34723506
_Found = true;
3473-
} else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh, _Node->_Equiv, _Traits)) {
3507+
} else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch, _Node->_Equiv, _Traits)) {
34743508
_Found = true;
34753509
} else {
34763510
_Found = false;
@@ -3811,10 +3845,9 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
38113845
const auto _UCh = static_cast<typename _RxTraits::_Uelem>(_Ch);
38123846

38133847
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx);
3814-
_It _Next = _First_arg;
3815-
++_Next;
38163848

3817-
if (_Node->_Coll && _STD _Lookup_coll(_First_arg, _Next, _Node->_Coll) != _First_arg) {
3849+
if (_Node->_Coll
3850+
&& _STD _Lookup_coll2(_Ch, _First_arg, _Last, _Node->_Coll, _Traits, _Sflags) != _First_arg) {
38183851
_Found = true;
38193852
} else if (_Node->_Ranges
38203853
&& (_Sflags & regex_constants::collate
@@ -3830,7 +3863,7 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
38303863
} else if (_Node->_Classes != typename _RxTraits::char_class_type{}
38313864
&& _Traits.isctype(_Ch, _Node->_Classes)) {
38323865
_Found = true;
3833-
} else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh, _Node->_Equiv, _Traits)) {
3866+
} else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch, _Node->_Equiv, _Traits)) {
38343867
_Found = true;
38353868
} else {
38363869
_Found = false;
@@ -4074,45 +4107,68 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_OctalDigits() { // check for up to 3 oc
40744107
}
40754108

40764109
template <class _FwdIt, class _Elem, class _RxTraits>
4077-
void _Parser<_FwdIt, _Elem, _RxTraits>::_Do_ex_class(
4110+
_Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_Do_ex_class2(
40784111
_Meta_type _End_arg) { // handle delimited expressions within bracket expression
4079-
regex_constants::error_type _Errtype = (_End_arg == _Meta_colon ? regex_constants::error_ctype
4080-
: _End_arg == _Meta_equal ? regex_constants::error_collate
4081-
: _End_arg == _Meta_dot ? regex_constants::error_collate
4082-
: regex_constants::error_syntax);
4083-
_FwdIt _Beg = _Pat;
4084-
_Iter_diff_t<_FwdIt> _Diff = 0;
4112+
const regex_constants::error_type _Errtype =
4113+
_End_arg == _Meta_colon ? regex_constants::error_ctype : regex_constants::error_collate;
4114+
const _FwdIt _Beg = _Pat;
40854115

40864116
while (_Mchar != _Meta_colon && _Mchar != _Meta_equal && _Mchar != _Meta_dot && _Mchar != _Meta_eos) {
40874117
// advance to end delimiter
40884118
_Next();
4089-
++_Diff;
40904119
}
4091-
if (_Mchar != _End_arg) {
4092-
_Error(_Errtype);
4093-
} else if (_End_arg == _Meta_colon) { // handle named character class
4120+
4121+
const _FwdIt _End = _Pat;
4122+
_Expect(_End_arg, _Errtype);
4123+
_Expect(_Meta_rsq, _Errtype);
4124+
4125+
if (_End_arg == _Meta_colon) { // handle named character class
40944126
typename _RxTraits::char_class_type _Cls =
4095-
_Traits.lookup_classname(_Beg, _Pat, (_Flags & regex_constants::icase) != 0);
4127+
_Traits.lookup_classname(_Beg, _End, (_Flags & regex_constants::icase) != 0);
40964128
if (!_Cls) {
40974129
_Error(regex_constants::error_ctype);
40984130
}
40994131

41004132
_Nfa._Add_named_class(_Cls, false);
4101-
} else if (_End_arg == _Meta_equal) { // process =
4102-
if (_Beg == _Pat) {
4133+
return _Prs_set;
4134+
} else {
4135+
typename _RxTraits::string_type _Coll_elem = _Traits.lookup_collatename(_Beg, _End);
4136+
const auto _Size = _Coll_elem.size();
4137+
4138+
if (_Size == 0) {
41034139
_Error(regex_constants::error_collate);
4104-
} else {
4105-
_Nfa._Add_equiv(_Beg, _Pat, _Diff);
41064140
}
4107-
} else if (_End_arg == _Meta_dot) { // process .
4108-
if (_Beg == _Pat) {
4109-
_Error(regex_constants::error_collate);
4110-
} else {
4111-
_Nfa._Add_coll(_Beg, _Pat, _Diff);
4141+
4142+
if (_Size > _Max_limit<unsigned int>()) {
4143+
_Error(regex_constants::error_space);
4144+
}
4145+
4146+
_Elem* const _Coll_elem_first = &_Coll_elem.front();
4147+
const _Elem* const _Coll_elem_last = _Coll_elem_first + _Size;
4148+
if (_End_arg == _Meta_equal) { // process equivalence
4149+
_Nfa._Add_equiv2(_Coll_elem_first, _Coll_elem_last);
4150+
return _Prs_set;
4151+
} else { // process collating element
4152+
if (_Size == 1) {
4153+
_Val = *_Coll_elem_first;
4154+
return _Prs_chr;
4155+
}
4156+
4157+
// Character ranges with multi-character bounds cannot be represented in NFA nodes yet (see GH-5391).
4158+
// Provisionally treat multi-character collating elements as character sets.
4159+
if (_Flags & regex_constants::icase) {
4160+
for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
4161+
*_Current = _Traits.translate_nocase(*_Current);
4162+
}
4163+
} else if (_Flags & regex_constants::collate) {
4164+
for (auto _Current = _Coll_elem_first; _Current != _Coll_elem_last; ++_Current) {
4165+
*_Current = _Traits.translate(*_Current);
4166+
}
4167+
}
4168+
_Nfa._Add_coll2(_Coll_elem_first, _Coll_elem_last);
4169+
return _Prs_set;
41124170
}
41134171
}
4114-
_Next();
4115-
_Expect(_Meta_rsq, _Errtype);
41164172
}
41174173

41184174
template <class _FwdIt, class _Elem, class _RxTraits>
@@ -4172,8 +4228,7 @@ _Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_ClassAtom(const bool _Initial) { //
41724228
if (_Mchar == _Meta_colon || _Mchar == _Meta_equal || _Mchar == _Meta_dot) { // handle delimited expression
41734229
_Meta_type _St = _Mchar;
41744230
_Next();
4175-
_Do_ex_class(_St);
4176-
return _Prs_set;
4231+
return _Do_ex_class2(_St);
41774232
} else { // handle ordinary [
41784233
_Val = _Meta_lsq;
41794234
return _Prs_chr;
@@ -4621,7 +4676,9 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Disjunction() { // check for valid disj
46214676
}
46224677
}
46234678

4624-
inline void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep) {
4679+
template <class _FwdIt, class _Elem, class _RxTraits>
4680+
void _Parser<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
4681+
_Node_base* _Nx, _Node_base* _Ne, _Node_rep* _Outer_rep) {
46254682
// walks regex NFA, calculates values of _Node_rep::_Simple_loop
46264683
for (; _Nx != _Ne && _Nx; _Nx = _Nx->_Next) {
46274684
switch (_Nx->_Kind) {
@@ -4662,14 +4719,26 @@ inline void _Calculate_loop_simplicity(_Node_base* _Nx, _Node_base* _Ne, _Node_r
46624719
_Outer_rep = nullptr;
46634720
}
46644721
break;
4722+
case _N_class:
4723+
if (_Outer_rep) {
4724+
// _Node_rep is not simple if a class can match character sequences of different lengths
4725+
auto _Node = static_cast<const _Node_class<_Elem, _RxTraits>*>(_Nx);
4726+
bool _Coll_diff_size =
4727+
_Node->_Coll
4728+
&& (_Node->_Small || _Node->_Large || _Node->_Ranges || _Node->_Classes || _Node->_Coll->_Next);
4729+
if (_Coll_diff_size || _Node->_Equiv
4730+
|| ((_Flags & regex_constants::collate) && (_Node->_Ranges || (_Node->_Flags & _Fl_negate)))) {
4731+
_Outer_rep->_Simple_loop = 0;
4732+
}
4733+
}
4734+
break;
46654735
case _N_none:
46664736
case _N_nop:
46674737
case _N_bol:
46684738
case _N_eol:
46694739
case _N_wbound:
46704740
case _N_dot:
46714741
case _N_str:
4672-
case _N_class:
46734742
case _N_group:
46744743
case _N_end_group:
46754744
case _N_end_assert:

0 commit comments

Comments
 (0)