From cff04c684c0492e5dc3a16ec9284db6ba7c480b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 1 May 2025 17:59:04 +0200 Subject: [PATCH 01/12] rename matcher and add template parameter for future allocator support --- stl/inc/regex | 72 ++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index b2023ebd87c..8e80f161283 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1599,10 +1599,10 @@ public: } }; -template -class _Matcher { // provides ways to match a regular expression to a text sequence +template +class _Matcher2 { // provides ways to match a regular expression to a text sequence public: - _Matcher(_It _Pfirst, _It _Plast, const _RxTraits& _Tr, _Root_node* _Re, unsigned int _Nx, + _Matcher2(_It _Pfirst, _It _Plast, const _RxTraits& _Tr, _Root_node* _Re, unsigned int _Nx, regex_constants::syntax_option_type _Sf, regex_constants::match_flag_type _Mf) : _End(_Plast), _First(_Pfirst), _Rep(_Re), _Sflags(_Sf), _Mflags(_Mf), _Ncap(static_cast(_Nx)), _Longest((_Re->_Flags & _Fl_longest) && !(_Mf & regex_constants::match_any)), _Traits(_Tr) { @@ -1618,15 +1618,15 @@ public: _Mflags &= ~_Mf; } - template - bool _Match(_It _Pfirst, match_results<_BidIt, _Alloc>* _Matches, bool _Full_match) { + template + bool _Match(_It _Pfirst, match_results<_BidIt, _Alsubmatch>* _Matches, bool _Full_match) { // try to match _First = _Pfirst; return _Match(_Matches, _Full_match); } - template - bool _Match(match_results<_BidIt, _Alloc>* _Matches, bool _Full_match) { + template + bool _Match(match_results<_BidIt, _Alsubmatch>* _Matches, bool _Full_match) { // try to match if (_Matches) { // clear _Matches before doing work _Matches->_Ready = true; @@ -1713,7 +1713,7 @@ private: long _Max_stack_count; public: - _Matcher& operator=(const _Matcher&) = delete; + _Matcher2& operator=(const _Matcher2&) = delete; }; enum _Prs_ret { // indicate class element type @@ -2186,7 +2186,7 @@ bool _Regex_match1(_It _First, _It _Last, match_results<_BidIt, _Alloc>* _Matche return false; } - _Matcher<_BidIt, _Elem, _RxTraits, _It> _Mx( + _Matcher2<_BidIt, _Elem, _RxTraits, _It, void> _Mx( _First, _Last, _Re._Get_traits(), _Re._Get(), _Re.mark_count() + 1, _Re.flags(), _Flgs); return _Mx._Match(_Matches, _Full); } @@ -2261,7 +2261,7 @@ bool _Regex_search2(_It _First, _It _Last, match_results<_BidIt, _Alloc>* _Match ++_First; } - _Matcher<_BidIt, _Elem, _RxTraits, _It> _Mx( + _Matcher2<_BidIt, _Elem, _RxTraits, _It, void> _Mx( _First, _Last, _Re._Get_traits(), _Re._Get(), _Re.mark_count() + 1, _Re.flags(), _Flgs); if (_Mx._Match(_Matches, false)) { @@ -3174,8 +3174,8 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Tidy() noexcept { // free memory _Root = nullptr; } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_assert(_Node_assert* _Node) { // apply assert node +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_assert(_Node_assert* _Node) { // apply assert node _It _Ch = _Tgt_state._Cur; if (_Match_pat(_Node->_Child)) { _Tgt_state._Cur = _Ch; @@ -3185,8 +3185,9 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_assert(_Node_assert* _Node) { } } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_neg_assert(_Node_assert* _Node) { // apply negative assert node +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_neg_assert(_Node_assert* _Node) { + // apply negative assert node _Bt_state_t<_It> _St = _Tgt_state; if (!_Match_pat(_Node->_Child)) { _Tgt_state = _St; @@ -3196,8 +3197,8 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_neg_assert(_Node_assert* _Node } } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_if(_Node_if* _Node) { // apply if node +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_if(_Node_if* _Node) { // apply if node _Tgt_state_t<_It> _St = _Tgt_state; // look for the first match @@ -3231,9 +3232,9 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_if(_Node_if* _Node) { // apply return true; } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_rep0( - _Node_rep* _Node, bool _Greedy) { // apply repetition to loop with no nested if/do +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node, bool _Greedy) { + // apply repetition to loop with no nested if/do int _Ix = 0; _Tgt_state_t<_It> _St = _Tgt_state; @@ -3290,8 +3291,8 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_rep0( return _Matched0; } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) { +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) { // apply repetition if (_Node->_Simple_loop == 1) { return _Do_rep0(_Node, _Greedy); @@ -3482,8 +3483,8 @@ _BidIt _Lookup_coll2(_Elem _First_ch, _BidIt _First, const _BidIt _Last, const _ return _First; } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // apply bracket expression +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_class(_Node_base* _Nx) { // apply bracket expression bool _Found; _Elem _Ch = *_Tgt_state._Cur; if (_Sflags & regex_constants::icase) { @@ -3539,8 +3540,9 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap } } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under leftmost-longest rule +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Better_match() { + // check for better match under leftmost-longest rule for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group // any match (even an empty one) is better than no match at all if (_Res._Grp_valid[_Ix] != _Tgt_state._Grp_valid[_Ix]) { @@ -3564,8 +3566,8 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for bet return false; } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Is_wbound() const { +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Is_wbound() const { if ((_Mflags & regex_constants::match_prev_avail) || _Tgt_state._Cur != _Begin) { // if --_Cur is valid, check for preceding word character if (_Tgt_state._Cur == _End) { @@ -3582,21 +3584,21 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Is_wbound() const { } } -template -typename _RxTraits::char_class_type _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Lookup_char_class( +template +typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Lookup_char_class( const _Elem _Class_name) const { // look up character class with single-character name auto _Ptr = _STD addressof(_Class_name); return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0); } -template -unsigned int _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Get_ncap() const { +template +unsigned int _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Get_ncap() const { return static_cast(_Ncap); } -template -bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // check for match +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _Nx) { // check for match if (0 < _Max_stack_count && --_Max_stack_count <= 0) { _Xregex_error(regex_constants::error_stack); } @@ -3817,8 +3819,8 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // c return !_Failed; } -template -_BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) { +template +_BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) { // skip until possible match // assumes --_First_arg is valid _Node_base* _Nx = _Node_arg ? _Node_arg : _Rep; From 5ef92673775451ccff966cf8f5789a02538f3963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 1 May 2025 17:59:41 +0200 Subject: [PATCH 02/12] adjust _Matcher2::_Ncap's type --- stl/inc/regex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 8e80f161283..1afeb6ab4c2 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1604,7 +1604,7 @@ class _Matcher2 { // provides ways to match a regular expression to a text seque public: _Matcher2(_It _Pfirst, _It _Plast, const _RxTraits& _Tr, _Root_node* _Re, unsigned int _Nx, regex_constants::syntax_option_type _Sf, regex_constants::match_flag_type _Mf) - : _End(_Plast), _First(_Pfirst), _Rep(_Re), _Sflags(_Sf), _Mflags(_Mf), _Ncap(static_cast(_Nx)), + : _End(_Plast), _First(_Pfirst), _Rep(_Re), _Sflags(_Sf), _Mflags(_Mf), _Ncap(_Nx), _Longest((_Re->_Flags & _Fl_longest) && !(_Mf & regex_constants::match_any)), _Traits(_Tr) { _Loop_vals.resize(_Re->_Loops); _Adl_verify_range(_Pfirst, _Plast); @@ -1705,7 +1705,7 @@ private: regex_constants::match_flag_type _Mflags; bool _Matched = false; bool _Cap; - int _Ncap; // Do not use. Use _Get_ncap instead. + unsigned int _Ncap; bool _Longest; const _RxTraits& _Traits; bool _Full; @@ -3594,7 +3594,7 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al template unsigned int _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Get_ncap() const { - return static_cast(_Ncap); + return _Ncap; } template From 3f6e6ec31c77af47b2e903a6319f55201a7178ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 1 May 2025 17:59:59 +0200 Subject: [PATCH 03/12] remove `_Matcher2::_Get_ncap()` --- stl/inc/regex | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 1afeb6ab4c2..159bd50b851 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1635,8 +1635,8 @@ public: _Begin = _First; _Tgt_state._Cur = _First; - _Tgt_state._Grp_valid.resize(_Get_ncap()); - _Tgt_state._Grps.resize(_Get_ncap()); + _Tgt_state._Grp_valid.resize(_Ncap); + _Tgt_state._Grps.resize(_Ncap); _Cap = static_cast(_Matches); _Full = _Full_match; _Max_complexity_count = _REGEX_MAX_COMPLEXITY_COUNT; @@ -1649,9 +1649,9 @@ public: } if (_Matches) { // copy results to _Matches - _Matches->_Resize(_Get_ncap()); + _Matches->_Resize(_Ncap); const auto& _Result = _Longest ? _Res : _Tgt_state; - for (unsigned int _Idx = 0; _Idx < _Get_ncap(); ++_Idx) { // copy submatch _Idx + for (unsigned int _Idx = 0; _Idx < _Ncap; ++_Idx) { // copy submatch _Idx if (_Result._Grp_valid[_Idx]) { // copy successful match _Matches->_At(_Idx).matched = true; _Matches->_At(_Idx).first = _Result._Grps[_Idx]._Begin; @@ -1695,8 +1695,6 @@ private: bool _Is_wbound() const; typename _RxTraits::char_class_type _Lookup_char_class(_Elem) const; - unsigned int _Get_ncap() const; - _It _Begin; _It _End; _It _First; @@ -3543,7 +3541,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_class(_Node_base* _Nx template bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Better_match() { // check for better match under leftmost-longest rule - for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group + for (unsigned int _Ix = 0; _Ix < _Ncap; ++_Ix) { // check each capture group // any match (even an empty one) is better than no match at all if (_Res._Grp_valid[_Ix] != _Tgt_state._Grp_valid[_Ix]) { return _Tgt_state._Grp_valid[_Ix]; @@ -3592,11 +3590,6 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0); } -template -unsigned int _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Get_ncap() const { - return _Ncap; -} - template bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _Nx) { // check for match if (0 < _Max_stack_count && --_Max_stack_count <= 0) { From f252cabd500b1d47e9a9e608c54f151b9f94bc78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 1 May 2025 18:07:27 +0200 Subject: [PATCH 04/12] remove duplicate member `_Matcher2::_First` in favor of `_Matcher2::_Begin` --- stl/inc/regex | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 159bd50b851..4d1bb4b1a61 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1604,7 +1604,7 @@ class _Matcher2 { // provides ways to match a regular expression to a text seque public: _Matcher2(_It _Pfirst, _It _Plast, const _RxTraits& _Tr, _Root_node* _Re, unsigned int _Nx, regex_constants::syntax_option_type _Sf, regex_constants::match_flag_type _Mf) - : _End(_Plast), _First(_Pfirst), _Rep(_Re), _Sflags(_Sf), _Mflags(_Mf), _Ncap(_Nx), + : _Begin(_Pfirst), _End(_Plast), _Rep(_Re), _Sflags(_Sf), _Mflags(_Mf), _Ncap(_Nx), _Longest((_Re->_Flags & _Fl_longest) && !(_Mf & regex_constants::match_any)), _Traits(_Tr) { _Loop_vals.resize(_Re->_Loops); _Adl_verify_range(_Pfirst, _Plast); @@ -1621,7 +1621,7 @@ public: template bool _Match(_It _Pfirst, match_results<_BidIt, _Alsubmatch>* _Matches, bool _Full_match) { // try to match - _First = _Pfirst; + _Begin = _Pfirst; return _Match(_Matches, _Full_match); } @@ -1633,8 +1633,7 @@ public: _Matches->_Resize(0); } - _Begin = _First; - _Tgt_state._Cur = _First; + _Tgt_state._Cur = _Begin; _Tgt_state._Grp_valid.resize(_Ncap); _Tgt_state._Grps.resize(_Ncap); _Cap = static_cast(_Matches); @@ -1697,7 +1696,6 @@ private: _It _Begin; _It _End; - _It _First; _Node_base* _Rep; regex_constants::syntax_option_type _Sflags; regex_constants::match_flag_type _Mflags; From 1d225e8c15cff1e5d7069d2b4a60cfe0df96aca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 1 May 2025 18:11:06 +0200 Subject: [PATCH 05/12] `_Do_rep`: Remove unnecessary local variable `_Cur_iter` and replace its uses by `_St._Cur` --- stl/inc/regex | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 4d1bb4b1a61..c85fa977c92 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3299,9 +3299,8 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, _Loop_vals_t* _Psav = &_Loop_vals[_Node->_Loop_number]; int _Loop_idx_sav = _Psav->_Loop_idx; _It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter); - _It _Cur_iter = _Tgt_state._Cur; - bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _Cur_iter; + bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur; if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) { _Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail @@ -3310,7 +3309,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, _Matched0 = _Match_pat(_Node->_End_rep->_Next); // empty, try tail } else { // try another required match _Psav->_Loop_idx = _Init_idx + 1; - _Psav->_Loop_iter = _STD addressof(_Cur_iter); + _Psav->_Loop_iter = _STD addressof(_St._Cur); _Matched0 = _Match_pat(_Node->_Next); } } else if (_Longest) { // longest, try any number of repetitions @@ -3321,7 +3320,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, if (_Progress) { _Tgt_state = _St; _Psav->_Loop_idx = _Init_idx + 1; - _Psav->_Loop_iter = _STD addressof(_Cur_iter); + _Psav->_Loop_iter = _STD addressof(_St._Cur); if (_Match_pat(_Node->_Next)) { // always call _Match_pat, even when _Matched0 is already true _Matched0 = true; @@ -3332,13 +3331,13 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, if (!_Matched0 && _Progress) { // tail failed, try another rep _Tgt_state = _St; _Psav->_Loop_idx = _Init_idx + 1; - _Psav->_Loop_iter = _STD addressof(_Cur_iter); + _Psav->_Loop_iter = _STD addressof(_St._Cur); _Matched0 = _Match_pat(_Node->_Next); } } else { // greedy, favor maximum number of reps if (_Progress) { // try another rep _Psav->_Loop_idx = _Init_idx + 1; - _Psav->_Loop_iter = _STD addressof(_Cur_iter); + _Psav->_Loop_iter = _STD addressof(_St._Cur); _Matched0 = _Match_pat(_Node->_Next); } From 90f84c2fbb23780d49b95dee6540d4808126587b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 1 May 2025 18:11:23 +0200 Subject: [PATCH 06/12] Add capture group range to stored loop state and properly reset capture groups before each repetition --- stl/inc/regex | 184 +++++++++++++++--- .../std/tests/VSO_0000000_regex_use/test.cpp | 22 +++ 2 files changed, 175 insertions(+), 31 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index c85fa977c92..c0e32480fef 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1493,9 +1493,10 @@ public: _Node_end_rep& operator=(const _Node_end_rep&) = delete; }; -struct _Loop_vals_t { // storage for loop administration - int _Loop_idx; +struct _Loop_vals_v2_t { // storage for loop administration void* _Loop_iter; + int _Loop_idx; + unsigned int _Group_first; }; class _Node_rep : public _Node_base { // node that marks the beginning of a repetition @@ -1681,13 +1682,15 @@ public: private: _Tgt_state_t<_It> _Tgt_state; _Tgt_state_t<_It> _Res; - vector<_Loop_vals_t> _Loop_vals; + vector<_Loop_vals_v2_t> _Loop_vals; bool _Do_assert(_Node_assert*); bool _Do_neg_assert(_Node_assert*); bool _Do_if(_Node_if*); bool _Do_rep0(_Node_rep*, bool); bool _Do_rep(_Node_rep*, bool, int); + bool _Do_rep_first(_Node_rep*); + bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*); bool _Do_class(_Node_base*); bool _Match_pat(_Node_base*); bool _Better_match(); @@ -3235,6 +3238,13 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node _Tgt_state_t<_It> _St = _Tgt_state; for (; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps + // GH-5365: We have to reset the capture groups from the second iteration on. + // We can avoid the reset for the first iteration + // because we know that a simple repetition was not encountered before. + if (_Ix > 0) { + _Tgt_state._Grp_valid = _St._Grp_valid; + } + _It _Cur = _Tgt_state._Cur; if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail _Tgt_state = _St; @@ -3290,17 +3300,12 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node template bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) { // apply repetition - if (_Node->_Simple_loop == 1) { - return _Do_rep0(_Node, _Greedy); - } - - bool _Matched0 = false; - _Tgt_state_t<_It> _St = _Tgt_state; - _Loop_vals_t* _Psav = &_Loop_vals[_Node->_Loop_number]; - int _Loop_idx_sav = _Psav->_Loop_idx; - _It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter); - - bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur; + bool _Matched0 = false; + _Tgt_state_t<_It> _St = _Tgt_state; + _Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number]; + int _Loop_idx_sav = _Psav->_Loop_idx; + _It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter); + bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur; if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) { _Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail @@ -3310,7 +3315,9 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, } else { // try another required match _Psav->_Loop_idx = _Init_idx + 1; _Psav->_Loop_iter = _STD addressof(_St._Cur); - _Matched0 = _Match_pat(_Node->_Next); + _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Psav->_Group_first), + _Tgt_state._Grp_valid.end(), false); + _Matched0 = _Match_pat(_Node->_Next); } } else if (_Longest) { // longest, try any number of repetitions @@ -3332,13 +3339,17 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, _Tgt_state = _St; _Psav->_Loop_idx = _Init_idx + 1; _Psav->_Loop_iter = _STD addressof(_St._Cur); - _Matched0 = _Match_pat(_Node->_Next); + _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Psav->_Group_first), + _Tgt_state._Grp_valid.end(), false); + _Matched0 = _Match_pat(_Node->_Next); } } else { // greedy, favor maximum number of reps if (_Progress) { // try another rep _Psav->_Loop_idx = _Init_idx + 1; _Psav->_Loop_iter = _STD addressof(_St._Cur); - _Matched0 = _Match_pat(_Node->_Next); + _STD fill(_Tgt_state._Grp_valid.begin() + static_cast(_Psav->_Group_first), + _Tgt_state._Grp_valid.end(), false); + _Matched0 = _Match_pat(_Node->_Next); } if ((_Progress || 1 >= _Init_idx) && !_Matched0) { // rep failed, try tail @@ -3358,6 +3369,127 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, return _Matched0; } +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep_first(_Node_rep* _Node) { + bool _Greedy = (_Node->_Flags & _Fl_greedy) != 0; + // apply repetition + if (_Node->_Simple_loop == 1) { + return _Do_rep0(_Node, _Greedy); + } + _Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number]; + + // Determine first capture group in repetition for later capture group reset, if not done so previously. + // No capture group reset is performed for POSIX regexes, + // so we prevent any reset by setting the first capture group to the number of capture groups _Ncap. + if (_Psav->_Group_first == 0) { + if ((_Sflags + & (regex_constants::basic | regex_constants::extended | regex_constants::grep | regex_constants::egrep + | regex_constants::awk)) + || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) { + _Psav->_Group_first = _Ncap; + } + } + + return _Do_rep(_Node, _Greedy, 0); +} + +template +bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture_group( + _Node_base* _Nx, _Loop_vals_v2_t* _Loop_state) { + if (0 < _Max_stack_count && --_Max_stack_count <= 0) { + _Xregex_error(regex_constants::error_stack); + } + + bool _Found_group = false; + while (_Nx) { + switch (_Nx->_Kind) { + case _N_nop: + case _N_bol: + case _N_eol: + case _N_wbound: + case _N_dot: + case _N_str: + case _N_class: + case _N_group: + case _N_end_group: + case _N_end_capture: + case _N_back: + case _N_begin: + break; + + case _N_assert: + case _N_neg_assert: + { + if (_Find_first_inner_capture_group(static_cast<_Node_assert*>(_Nx), _Loop_state)) { + _Found_group = true; + _Nx = nullptr; + } + break; + } + + case _N_capture: + { + _Node_capture* _Node = static_cast<_Node_capture*>(_Nx); + _Loop_state->_Group_first = _Node->_Idx; + _Found_group = true; + _Nx = nullptr; + break; + } + + case _N_if: + { + _Node_if* _Node = static_cast<_Node_if*>(_Nx); + for (; _Node != nullptr; _Node = _Node->_Child) { + if (_Find_first_inner_capture_group(_Node->_Next, _Loop_state)) { + _Found_group = true; + _Nx = nullptr; + break; + } + } + + if (_Nx != nullptr) { // continue search after the branches of the _N_if node + _Nx = static_cast<_Node_if*>(_Nx)->_Endif; + } + break; + } + + case _N_rep: + { + _Node_rep* _Inner_rep = static_cast<_Node_rep*>(_Nx); + _Loop_vals_v2_t* _Inner_loop_state = &_Loop_vals[_Inner_rep->_Loop_number]; + if (_Find_first_inner_capture_group(_Inner_rep->_Next, _Inner_loop_state)) { + _Loop_state->_Group_first = _Inner_loop_state->_Group_first; + _Found_group = true; + _Nx = nullptr; + } else { + _Inner_loop_state->_Group_first = _Ncap; + _Nx = _Inner_rep->_End_rep; + } + break; + } + + case _N_end_assert: + case _N_endif: + case _N_end_rep: + case _N_end: + case _N_none: + default: + _Nx = nullptr; + break; + } + + if (_Nx) { + _Nx = _Nx->_Next; + } + } + + if (0 < _Max_stack_count) { + ++_Max_stack_count; + } + + return _Found_group; +} + template _BidIt1 _Cmp_chrange(_BidIt1 _Begin1, _BidIt1 _End1, _BidIt2 _Begin2, _BidIt2 _End2, _Pr _Pred) { // compare character ranges @@ -3695,15 +3827,6 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N { // record current position _Node_capture* _Node = static_cast<_Node_capture*>(_Nx); _Tgt_state._Grps[_Node->_Idx]._Begin = _Tgt_state._Cur; - if (!(_Sflags - & (regex_constants::basic | regex_constants::extended | regex_constants::grep - | regex_constants::egrep | regex_constants::awk))) { - // CodeQL [SM02323] Comparing unchanging unsigned int _Node->_Idx to decreasing size_t _Idx is safe. - for (size_t _Idx = _Tgt_state._Grp_valid.size(); _Node->_Idx < _Idx;) { - _Tgt_state._Grp_valid[--_Idx] = false; - } - } - break; } @@ -3752,7 +3875,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N break; case _N_rep: - if (!_Do_rep(static_cast<_Node_rep*>(_Nx), (_Nx->_Flags & _Fl_greedy) != 0, 0)) { + if (!_Do_rep_first(static_cast<_Node_rep*>(_Nx))) { _Failed = true; } @@ -3761,10 +3884,9 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N case _N_end_rep: { - _Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep; - _Loop_vals_t* _Psav = &_Loop_vals[_Nr->_Loop_number]; - - if (_Nr->_Simple_loop == 0 && !_Do_rep(_Nr, (_Nr->_Flags & _Fl_greedy) != 0, _Psav->_Loop_idx)) { + _Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep; + if (_Nr->_Simple_loop == 0 + && !_Do_rep(_Nr, (_Nr->_Flags & _Fl_greedy) != 0, _Loop_vals[_Nr->_Loop_number]._Loop_idx)) { _Failed = true; // recurse only if loop contains if/do } diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 4f7f3b1e855..f71960025ad 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -1559,6 +1559,27 @@ void test_gh_5364() { g_regexTester.should_match("c", "[^]", ECMAScript); } +void test_gh_5365() { + // GH-5365: : Implementation divergence for capture group behavior: + // Capture groups were not correctly cleared at the beginning of repetitions in ECMAScript mode. + { + test_regex captures_in_repeated_noncapturing_group(&g_regexTester, "^(?:(a)|(b)|(c)|(d))+$"); + captures_in_repeated_noncapturing_group.should_search_match_capture_groups( + "acbd", "acbd", match_default, {{-1, -1}, {-1, -1}, {-1, -1}, {3, 4}}); + captures_in_repeated_noncapturing_group.should_search_match_capture_groups( + "adcba", "adcba", match_default, {{4, 5}, {-1, -1}, {-1, -1}, {-1, -1}}); + } + { + test_regex captures_in_questionmark_quantifiers(&g_regexTester, "(z)((a+)?(b+)?(c))*"); + captures_in_questionmark_quantifiers.should_search_match_capture_groups( + "zaacbbbcac", "zaacbbbcac", match_default, {{0, 1}, {8, 10}, {8, 9}, {-1, -1}, {9, 10}}); + captures_in_questionmark_quantifiers.should_search_match_capture_groups( + "zaacbbbcbbc", "zaacbbbcbbc", match_default, {{0, 1}, {8, 11}, {-1, -1}, {8, 10}, {10, 11}}); + captures_in_questionmark_quantifiers.should_search_match_capture_groups( + "zaacbbbcabbc", "zaacbbbcabbc", match_default, {{0, 1}, {8, 12}, {8, 9}, {9, 11}, {11, 12}}); + } +} + void test_gh_5371() { // GH-5371 : \b and \B are backwards on empty strings g_regexTester.should_not_match("", R"(\b)"); @@ -1664,6 +1685,7 @@ int main() { test_gh_5253(); test_gh_5362(); test_gh_5364(); + test_gh_5365(); test_gh_5371(); test_gh_5374(); test_gh_5377(); From 7abea50e7609e4d04cfc44cbf46fdd06cf2c3dc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 1 May 2025 21:29:39 +0200 Subject: [PATCH 07/12] remove nullptr --- stl/inc/regex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index c0e32480fef..75d416b07a0 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3439,7 +3439,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture case _N_if: { _Node_if* _Node = static_cast<_Node_if*>(_Nx); - for (; _Node != nullptr; _Node = _Node->_Child) { + for (; _Node; _Node = _Node->_Child) { if (_Find_first_inner_capture_group(_Node->_Next, _Loop_state)) { _Found_group = true; _Nx = nullptr; @@ -3447,7 +3447,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture } } - if (_Nx != nullptr) { // continue search after the branches of the _N_if node + if (_Nx) { // continue search after the branches of the _N_if node _Nx = static_cast<_Node_if*>(_Nx)->_Endif; } break; From 31b46db3c8c4af786b609002fe3797ddc878e2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Mon, 5 May 2025 19:22:45 +0200 Subject: [PATCH 08/12] extend test coverage --- tests/std/tests/VSO_0000000_regex_use/test.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index f71960025ad..f85dd80d647 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -1562,13 +1562,21 @@ void test_gh_5364() { void test_gh_5365() { // GH-5365: : Implementation divergence for capture group behavior: // Capture groups were not correctly cleared at the beginning of repetitions in ECMAScript mode. - { - test_regex captures_in_repeated_noncapturing_group(&g_regexTester, "^(?:(a)|(b)|(c)|(d))+$"); + for (string pattern : {"^(?:(a)|(b)|(c)|(d))+$", "^(?:(a)|(b)|(c)|(d))+?$", "^(?:(a)|(b)|(c)|(d)){4,}$"}) { + test_regex captures_in_repeated_noncapturing_group(&g_regexTester, pattern); captures_in_repeated_noncapturing_group.should_search_match_capture_groups( "acbd", "acbd", match_default, {{-1, -1}, {-1, -1}, {-1, -1}, {3, 4}}); captures_in_repeated_noncapturing_group.should_search_match_capture_groups( "adcba", "adcba", match_default, {{4, 5}, {-1, -1}, {-1, -1}, {-1, -1}}); } + + { + test_regex captures_in_repeated_noncapturing_group(&g_regexTester, "^(?:(a)|(b)|(c)|(d)){5}$"); + captures_in_repeated_noncapturing_group.should_search_fail("acbd"); + captures_in_repeated_noncapturing_group.should_search_match_capture_groups( + "adcba", "adcba", match_default, {{4, 5}, {-1, -1}, {-1, -1}, {-1, -1}}); + } + { test_regex captures_in_questionmark_quantifiers(&g_regexTester, "(z)((a+)?(b+)?(c))*"); captures_in_questionmark_quantifiers.should_search_match_capture_groups( From cf42b7f4ba5781308ac3259842d7d0e51b241c2f Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 8 May 2025 00:25:51 -0700 Subject: [PATCH 09/12] Add DMIs to `_Loop_vals_v2_t`. --- stl/inc/regex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 75d416b07a0..1df37355872 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1494,9 +1494,9 @@ public: }; struct _Loop_vals_v2_t { // storage for loop administration - void* _Loop_iter; - int _Loop_idx; - unsigned int _Group_first; + void* _Loop_iter = nullptr; + int _Loop_idx = 0; + unsigned int _Group_first = 0; }; class _Node_rep : public _Node_base { // node that marks the beginning of a repetition From 9d5a528aca26ea0d11060c72d92489e7cfe9916c Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 8 May 2025 00:52:58 -0700 Subject: [PATCH 10/12] Extract `_Any_posix`. --- stl/inc/regex | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 1df37355872..e5e8344c1bc 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3382,10 +3382,9 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep_first(_Node_rep* // No capture group reset is performed for POSIX regexes, // so we prevent any reset by setting the first capture group to the number of capture groups _Ncap. if (_Psav->_Group_first == 0) { - if ((_Sflags - & (regex_constants::basic | regex_constants::extended | regex_constants::grep | regex_constants::egrep - | regex_constants::awk)) - || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) { + constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep + | regex_constants::egrep | regex_constants::awk; + if ((_Sflags & _Any_posix) || !_Find_first_inner_capture_group(_Node->_Next, _Psav)) { _Psav->_Group_first = _Ncap; } } @@ -3762,10 +3761,10 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N if (_Tgt_state._Cur == _End) { _Failed = true; } else { + constexpr auto _Any_posix = regex_constants::basic | regex_constants::extended | regex_constants::grep + | regex_constants::egrep | regex_constants::awk; const _Elem _Ch = *_Tgt_state._Cur; - if (_Sflags - & (regex_constants::basic | regex_constants::extended | regex_constants::grep - | regex_constants::egrep | regex_constants::awk)) { + if (_Sflags & _Any_posix) { if (_Ch == _Elem()) { _Failed = true; } From f5b3ebbd84b69fe695dd3eaff453fa77bedb9a8c Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 8 May 2025 00:57:48 -0700 Subject: [PATCH 11/12] Scope `_Node_if* _Node` to the for-loop. --- stl/inc/regex | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index e5e8344c1bc..d7aa175d023 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3437,8 +3437,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture case _N_if: { - _Node_if* _Node = static_cast<_Node_if*>(_Nx); - for (; _Node; _Node = _Node->_Child) { + for (_Node_if* _Node = static_cast<_Node_if*>(_Nx); _Node; _Node = _Node->_Child) { if (_Find_first_inner_capture_group(_Node->_Next, _Loop_state)) { _Found_group = true; _Nx = nullptr; From 2aa8ccf984a26e3330ab694731cefc7ec07ab414 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 8 May 2025 00:56:33 -0700 Subject: [PATCH 12/12] Avoid unnecessary braces in switches. --- stl/inc/regex | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index d7aa175d023..d8dca1a8199 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3418,13 +3418,11 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture case _N_assert: case _N_neg_assert: - { - if (_Find_first_inner_capture_group(static_cast<_Node_assert*>(_Nx), _Loop_state)) { - _Found_group = true; - _Nx = nullptr; - } - break; + if (_Find_first_inner_capture_group(static_cast<_Node_assert*>(_Nx), _Loop_state)) { + _Found_group = true; + _Nx = nullptr; } + break; case _N_capture: { @@ -3436,20 +3434,18 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture } case _N_if: - { - for (_Node_if* _Node = static_cast<_Node_if*>(_Nx); _Node; _Node = _Node->_Child) { - if (_Find_first_inner_capture_group(_Node->_Next, _Loop_state)) { - _Found_group = true; - _Nx = nullptr; - break; - } + for (_Node_if* _Node = static_cast<_Node_if*>(_Nx); _Node; _Node = _Node->_Child) { + if (_Find_first_inner_capture_group(_Node->_Next, _Loop_state)) { + _Found_group = true; + _Nx = nullptr; + break; } + } - if (_Nx) { // continue search after the branches of the _N_if node - _Nx = static_cast<_Node_if*>(_Nx)->_Endif; - } - break; + if (_Nx) { // continue search after the branches of the _N_if node + _Nx = static_cast<_Node_if*>(_Nx)->_Endif; } + break; case _N_rep: {