Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 126 additions & 91 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ namespace regex_constants {
grep = 0x10,
egrep = 0x20,
_Gmask = 0x3F,
_Any_posix = basic | extended | grep | egrep | awk,

icase = 0x0100,
nosubs = 0x0200,
Expand Down Expand Up @@ -3199,10 +3200,14 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_rep2(int _Min, int _Max, bool _Gre
_Node_base* _Pos = _Current;
if (_Pos->_Kind == _N_end_group || _Pos->_Kind == _N_end_capture) {
_Pos = static_cast<_Node_end_group*>(_Pos)->_Back;
}

if (_Min == 0 && _Max == 1) { // rewrite zero-or-one quantifiers as alternations to make the
// "simple loop" optimization more likely to engage
} else if (_Min == 0 && _Max == 1) {
// Rewrite zero-or-one quantifiers as alternations to make the
// "simple loop" optimization more likely to engage.
//
// GH-5490: This rewrite becomes observably incorrect
// if the subexpression contains capture groups,
// so we don't apply it if the subexpression is surrounded
// by a capturing or non-capturing group.
_Node_endif* _End = new _Node_endif;
_Node_if* _If_expr = new _Node_if(_End);
_Node_if* _If_empty_str = new _Node_if(_End);
Expand All @@ -3226,13 +3231,14 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_rep2(int _Min, int _Max, bool _Gre
swap(_If_expr->_Next->_Prev, _If_empty_str->_Next->_Prev); // intentional ADL
swap(_If_expr->_Next, _If_empty_str->_Next); // intentional ADL
}
} else {
_Node_end_rep* _Node0 = new _Node_end_rep();
_Node_rep* _Nx = new _Node_rep(_Greedy, _Min, _Max, _Node0, _Root->_Loops++);
_Node0->_Begin_rep = _Nx;
_Link_node(_Node0);
_Insert_node(_Pos, _Nx);
return;
}

_Node_end_rep* _Node0 = new _Node_end_rep();
_Node_rep* _Nx = new _Node_rep(_Greedy, _Min, _Max, _Node0, _Root->_Loops++);
_Node0->_Begin_rep = _Nx;
_Link_node(_Node0);
_Insert_node(_Pos, _Nx);
}

template <class _FwdIt, class _Elem, class _RxTraits>
Expand Down Expand Up @@ -3311,26 +3317,30 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
int _Ix = 0;
_Tgt_state_t<_It> _St = _Tgt_state;

for (; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps
// GH-5365: We have to reset the capture groups from the second iteration on.
// We can avoid the reset for the first iteration
// because we know that a simple repetition was not encountered before.
if (_Ix > 0) {
_Tgt_state._Grp_valid = _St._Grp_valid;
}

_It _Cur = _Tgt_state._Cur;
if (0 < _Node->_Min) {
// GH-5365: We can avoid resetting capture groups for the first iteration
// because we know that a simple repetition of this loop was not encountered before.
if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail
_Tgt_state = _St;
return false;
} else if (_Cur == _Tgt_state._Cur) {
_Ix = _Node->_Min - 1; // skip matches that don't change state
} else if (_Tgt_state._Cur == _St._Cur) { // matches empty string
// loop is branchless, so it will only ever match empty strings
// -> skip all other matches as they don't change state and immediately try tail
return _Match_pat(_Node->_End_rep->_Next);
} else { // loop never matches the empty string
for (_Ix = 1; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps
// GH-5365: We have to reset the capture groups from the second iteration on.
_Tgt_state._Grp_valid = _St._Grp_valid;
if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail
return false;
}
}
}
}

_Tgt_state_t<_It> _Final = _Tgt_state;
bool _Matched0 = false;
_It _Saved_pos = _Tgt_state._Cur;
_Tgt_state_t<_It> _Final;
bool _Matched0 = false;
_It _Saved_pos = _Tgt_state._Cur;
bool _Done = false;

if (_Match_pat(_Node->_End_rep->_Next)) {
if (!_Greedy) {
Expand All @@ -3342,32 +3352,58 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
_Matched0 = true;
}

while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
if (_Ix == 0 && _Node->_Max != 0) {
_Tgt_state._Cur = _Saved_pos;
_Tgt_state._Grp_valid = _St._Grp_valid;
if (!_Match_pat(_Node->_Next)) {
break; // rep match failed, quit loop
}

_It _Mid = _Tgt_state._Cur;
if (_Match_pat(_Node->_End_rep->_Next)) {
if (!_Greedy) {
if (!_Match_pat(_Node->_Next)) { // rep match failed, we are done
_Done = true;
} else if (_Saved_pos == _Tgt_state._Cur) { // match empty, try no more repetitions
_Done = true;
// we only potentially accept/try tail for POSIX
if ((_Sflags & regex_constants::_Any_posix) && _Match_pat(_Node->_End_rep->_Next)) {
return true; // go with current match
}
} else {
_Saved_pos = _Tgt_state._Cur;
if (_Match_pat(_Node->_End_rep->_Next)) {
if (!_Greedy) {
return true; // go with current match
}

// record match and continue
_Final = _Tgt_state;
_Matched0 = true;
// record match and continue
_Final = _Tgt_state;
_Matched0 = true;
}
}
_Ix = 1;
}

if (_Saved_pos == _Mid) {
break; // rep match ate no additional elements, quit loop
}
if (!_Done) {
while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
_Tgt_state._Cur = _Saved_pos;
_Tgt_state._Grp_valid = _St._Grp_valid;
if (!_Match_pat(_Node->_Next) || _Tgt_state._Cur == _Saved_pos) {
break; // rep match failed, quit loop
}

_Saved_pos = _Mid;
// since loop is branchless, empty rep match is not possible at this point
_Saved_pos = _Tgt_state._Cur;
if (_Match_pat(_Node->_End_rep->_Next)) {
if (!_Greedy) {
return true; // go with current match
}

// record match and continue
_Final = _Tgt_state;
_Matched0 = true;
}
}
}

_Tgt_state = _Matched0 ? _Final : _St;
if (_Matched0) { // record final match
_Tgt_state = _Final;
}
return _Matched0;
}

Expand All @@ -3381,61 +3417,56 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node,
_It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter);
bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur;

if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) {
_Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail
} else if (_Init_idx < _Node->_Min) { // try a required rep
if (!_Progress) {
_Matched0 = _Match_pat(_Node->_End_rep->_Next); // empty, try tail
} else { // try another required match
_Psav->_Loop_idx = _Init_idx + 1;
_Psav->_Loop_iter = _STD addressof(_St._Cur);
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
_Tgt_state._Grp_valid.end(), false);
_Matched0 = _Match_pat(_Node->_Next);
}
} else if (_Longest) { // longest, try any number of repetitions

// match with no further repetition
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
// match with at least one more repetition if last repetition made progress
if (_Progress) {
if (_Init_idx < _Node->_Min) { // try another required match
_Psav->_Loop_iter = _STD addressof(_St._Cur);
_Psav->_Loop_idx = _Progress ? _Init_idx + 1 : _Node->_Min; // try only one more match after an empty match
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
_Tgt_state._Grp_valid.end(), false);
_Matched0 = _Match_pat(_Node->_Next);
} else if (_Init_idx == _Node->_Min || _Progress) {
if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) {
_Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail
} else if (_Longest) { // longest, try any number of repetitions

// match with no further repetition
_Matched0 = _Match_pat(_Node->_End_rep->_Next);

// try to match with one more repetition
_Tgt_state = _St;
_Psav->_Loop_idx = _Init_idx + 1;
_Psav->_Loop_iter = _STD addressof(_St._Cur);

if (_Match_pat(_Node->_Next)) { // always call _Match_pat, even when _Matched0 is already true
_Matched0 = true;
}
}
} else if (!_Greedy) { // not greedy, favor minimum number of reps
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
if (!_Matched0 && _Progress) { // tail failed, try another rep
_Tgt_state = _St;
_Psav->_Loop_idx = _Init_idx + 1;
_Psav->_Loop_iter = _STD addressof(_St._Cur);
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
_Tgt_state._Grp_valid.end(), false);
_Matched0 = _Match_pat(_Node->_Next);
}
} else { // greedy, favor maximum number of reps
if (_Progress) { // try another rep
} else if (!_Greedy) { // not greedy, favor minimum number of reps
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
if (!_Matched0) { // tail failed, try another rep
_Tgt_state = _St;
_Psav->_Loop_idx = _Init_idx + 1;
_Psav->_Loop_iter = _STD addressof(_St._Cur);
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
_Tgt_state._Grp_valid.end(), false);
_Matched0 = _Match_pat(_Node->_Next);
}
} else { // greedy, favor maximum number of reps,
// so try another rep
_Psav->_Loop_idx = _Init_idx + 1;
_Psav->_Loop_iter = _STD addressof(_St._Cur);
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
_Tgt_state._Grp_valid.end(), false);
_Matched0 = _Match_pat(_Node->_Next);
}

if ((_Progress || 1 >= _Init_idx) && !_Matched0) { // rep failed, try tail
_Psav->_Loop_idx = _Loop_idx_sav;
_Psav->_Loop_iter = _Loop_iter_sav;
_Tgt_state = _St;
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
if (!_Matched0) { // rep failed, try tail
_Psav->_Loop_idx = _Loop_idx_sav;
_Psav->_Loop_iter = _Loop_iter_sav;
_Tgt_state = _St;
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
}
}
}

if (!_Matched0) {
_Tgt_state = _St;
} else if (_Init_idx == 1 && (_Sflags & regex_constants::_Any_posix)) {
// POSIX allows an empty repetition if the subexpression is matched only once,
// so try tail
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
}

_Psav->_Loop_idx = _Loop_idx_sav;
Expand All @@ -3456,9 +3487,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep_first(_Node_rep*
// No capture group reset is performed for POSIX regexes,
// so we prevent any reset by setting the first capture group to the number of capture groups _Ncap.
if (_Psav->_Group_first == 0) {
constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep
| regex_constants::egrep | regex_constants::awk;
if ((_Sflags & _Any_posix) || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) {
if ((_Sflags & regex_constants::_Any_posix) || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) {
_Psav->_Group_first = _Ncap;
}
}
Expand Down Expand Up @@ -3833,10 +3862,8 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
if (_Tgt_state._Cur == _End) {
_Failed = true;
} else {
constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep
| regex_constants::egrep | regex_constants::awk;
const _Elem _Ch = *_Tgt_state._Cur;
if (_Sflags & _Any_posix) {
if (_Sflags & regex_constants::_Any_posix) {
if (_Ch == _Elem()) {
_Failed = true;
}
Expand Down Expand Up @@ -4923,10 +4950,18 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
_Calculate_loop_simplicity(static_cast<_Node_assert*>(_Nx)->_Child, nullptr, nullptr);
break;
case _N_rep:
// _Node_rep inside another _Node_rep makes both not simple
// _Node_rep inside another _Node_rep makes both not simple if _Outer_rep can be repeated more than once
// because _Matcher::_Do_rep0() does not reset capture group boundaries when control is returned to it.
// If _Outer_rep can repeat at most once, we have to analyze the structure of the inner loop.
if (_Outer_rep) {
_Outer_rep->_Simple_loop = 0;
static_cast<_Node_rep*>(_Nx)->_Simple_loop = 0;
_Outer_rep->_Simple_loop = 0;
auto _Inner_rep = static_cast<_Node_rep*>(_Nx);
if (_Outer_rep->_Max >= 0 && _Outer_rep->_Max <= 1) {
_Calculate_loop_simplicity(_Inner_rep->_Next, _Inner_rep->_End_rep->_Next, _Inner_rep);
_Nx = _Inner_rep->_End_rep;
} else {
_Inner_rep->_Simple_loop = 0;
}
} else {
_Outer_rep = static_cast<_Node_rep*>(_Nx);
}
Expand Down
Loading