Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ add_benchmark(path_lexically_normal src/path_lexically_normal.cpp)
add_benchmark(priority_queue_push_range src/priority_queue_push_range.cpp)
add_benchmark(random_integer_generation src/random_integer_generation.cpp)
add_benchmark(ranges_div_ceil src/ranges_div_ceil.cpp)
add_benchmark(regex_match src/regex_match.cpp)
add_benchmark(regex_search src/regex_search.cpp)
add_benchmark(remove src/remove.cpp)
add_benchmark(replace src/replace.cpp)
Expand Down
35 changes: 35 additions & 0 deletions benchmarks/src/regex_match.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <benchmark/benchmark.h>
#include <regex>
#include <string>


using namespace std;
using namespace regex_constants;

// Benchmarks regex_match() of `pattern` against an input made up solely of the character 'a'.
//
// state   - Google Benchmark state; state.range() supplies the input length.
// pattern - regular expression source, compiled with the given `syntax` flags.
// syntax  - grammar flags forwarded to the regex constructor (defaults to ECMAScript).
void bm_match_sequence_of_as(benchmark::State& state, const char* pattern, syntax_option_type syntax = ECMAScript) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No change requested: The syntax is never customized, but I see that it's imitating regex_search.cpp, and I suppose it's not too confusing to leave as-is.

// The subject string and the compiled regex are built once, before the benchmark loop,
// so their construction is not part of the per-iteration work.
string input(static_cast<size_t>(state.range()), 'a');
regex re{pattern, syntax};

for (auto _ : state) {
// NOTE(review): presumably here so the optimizer cannot assume anything about the contents of `input`.
benchmark::DoNotOptimize(input);
const char* pos = input.data();
const char* end = input.data() + input.size();
// A fresh cmatch is created on every iteration, so its construction and destruction
// happen inside the loop together with the regex_match() call.
cmatch match;
// The boolean result is intentionally unused; some registered patterns cannot match this input.
regex_match(pos, end, match, re);
}
}

// Each registration passes the same text twice: once as the benchmark case name and once as the
// `pattern` argument of bm_match_sequence_of_as. Every case runs with input lengths 100, 200 and 400.
// The patterns vary the quantified sub-expression: plain character, non-greedy loop, non-capturing
// group, capturing group, and alternations with and without captures.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*", "a*")->Arg(100)->Arg(200)->Arg(400);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "a*?", "a*?")->Arg(100)->Arg(200)->Arg(400);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:a)*", "(?:a)*")->Arg(100)->Arg(200)->Arg(400);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)*", "(a)*")->Arg(100)->Arg(200)->Arg(400);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(?:b|a)*", "(?:b|a)*")->Arg(100)->Arg(200)->Arg(400);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(b|a)*", "(b|a)*")->Arg(100)->Arg(200)->Arg(400);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*", "(a)(?:b|a)*")->Arg(100)->Arg(200)->Arg(400);
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(b|a)*", "(a)(b|a)*")->Arg(100)->Arg(200)->Arg(400);
// This pattern requires a trailing 'c', which an input of only 'a' characters never contains,
// so regex_match() cannot succeed here; presumably this measures the failing-match path.
BENCHMARK_CAPTURE(bm_match_sequence_of_as, "(a)(?:b|a)*c", "(a)(?:b|a)*c")->Arg(100)->Arg(200)->Arg(400);

// Expands to the program entry point supplied by Google Benchmark.
BENCHMARK_MAIN();
81 changes: 59 additions & 22 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1681,6 +1681,8 @@ enum class _Rx_unwind_ops {
_Loop_nongreedy,
_Loop_greedy,
_Loop_restore_vals,
_Capture_restore_begin,
_Capture_restore_end
};

template <class _BidIt>
Expand All @@ -1689,7 +1691,7 @@ public:
_Rx_unwind_ops _Code;
int _Loop_idx_sav;
_Node_base* _Node;
_Tgt_state_t<_BidIt> _Match_state;
_Bt_state_t<_BidIt> _Match_state;
size_t _Loop_frame_idx_sav;
};

Expand Down Expand Up @@ -3919,25 +3921,40 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
}

case _N_end_assert:
for (;;) {
--_Frames_count;
const auto& _Frame = _Frames[_Frames_count];
const auto _Code = _Frame._Code;
if (_Code == _Rx_unwind_ops::_After_assert || _Code == _Rx_unwind_ops::_After_neg_assert) {
_Tgt_state._Cur = _Frame._Match_state._Cur;
_Decrease_stack_usage_count();
if (_Code == _Rx_unwind_ops::_After_assert) {
_Next = _Frame._Node->_Next;
} else {
_Failed = true;
{
size_t _Last_capture_restore_frame = 0U;
for (;;) {
--_Frames_count;
const auto& _Frame = _Frames[_Frames_count];
const auto _Code = _Frame._Code;
if (_Code == _Rx_unwind_ops::_After_assert || _Code == _Rx_unwind_ops::_After_neg_assert) {
_Tgt_state._Cur = _Frame._Match_state._Cur;
_Decrease_stack_usage_count();
if (_Code == _Rx_unwind_ops::_After_assert) {
_Next = _Frame._Node->_Next;
if (_Last_capture_restore_frame != 0U) {
_Frames_count = static_cast<size_t>(
_STD remove_if(_Frames.begin() + static_cast<ptrdiff_t>(_Frames_count),
_Frames.begin() + static_cast<ptrdiff_t>(_Last_capture_restore_frame) + 1,
[](const auto& _Other_frame) {
return _Other_frame._Code != _Rx_unwind_ops::_Capture_restore_begin
&& _Other_frame._Code != _Rx_unwind_ops::_Capture_restore_end;
})
- _Frames.begin());
}
} else {
_Failed = true;
}
break;
} else if (_Code == _Rx_unwind_ops::_Disjunction_eval_alt_on_failure
|| _Code == _Rx_unwind_ops::_Disjunction_eval_alt_always
|| _Code == _Rx_unwind_ops::_Loop_greedy //
|| _Code == _Rx_unwind_ops::_Loop_nongreedy
|| _Code == _Rx_unwind_ops::_Loop_restore_vals) {
_Decrease_stack_usage_count();
} else if (_Code == _Rx_unwind_ops::_Capture_restore_end && _Last_capture_restore_frame == 0U) {
_Last_capture_restore_frame = _Frames_count;
}
break;
} else if (_Code == _Rx_unwind_ops::_Disjunction_eval_alt_on_failure
|| _Code == _Rx_unwind_ops::_Disjunction_eval_alt_always
|| _Code == _Rx_unwind_ops::_Loop_greedy //
|| _Code == _Rx_unwind_ops::_Loop_nongreedy
|| _Code == _Rx_unwind_ops::_Loop_restore_vals) {
_Decrease_stack_usage_count();
}
}
break;
Expand All @@ -3946,7 +3963,10 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
{ // record current position
_Node_capture* _Node = static_cast<_Node_capture*>(_Nx);
if (_Node->_Idx != 0U) {
_Tgt_state._Grps[_Node->_Idx]._Begin = _Tgt_state._Cur;
auto& _Group = _Tgt_state._Grps[_Node->_Idx];
auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Capture_restore_begin, _Node);
_Frames[_Frame_idx]._Match_state._Cur = _Group._Begin;
_Group._Begin = _Tgt_state._Cur;
Comment on lines +3969 to +3970
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No change requested: I observe that _STD exchange could be used for this code pattern, although we don't universally use it. (This also occurs immediately below for _N_end_capture.)

}
break;
}
Expand All @@ -3956,8 +3976,11 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
_Node_end_group* _Node = static_cast<_Node_end_group*>(_Nx);
_Node_capture* _Node0 = static_cast<_Node_capture*>(_Node->_Back);
if (_Node0->_Idx != 0U) { // update capture data
_Tgt_state._Grp_valid[_Node0->_Idx] = true;
_Tgt_state._Grps[_Node0->_Idx]._End = _Tgt_state._Cur;
auto& _Group = _Tgt_state._Grps[_Node0->_Idx];
auto _Frame_idx = _Push_frame(_Rx_unwind_ops::_Capture_restore_end, _Node0);
_Frames[_Frame_idx]._Match_state._Cur = _Group._End;
_Tgt_state._Grp_valid[_Node0->_Idx] = true;
_Group._End = _Tgt_state._Cur;
}
break;
}
Expand Down Expand Up @@ -4325,6 +4348,20 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
}
break;

case _Rx_unwind_ops::_Capture_restore_begin:
{ // restore begin of capturing group
auto _Node = static_cast<_Node_capture*>(_Frame._Node);
_Tgt_state._Grps[_Node->_Idx]._Begin = _Frame._Match_state._Cur;
}
break;

case _Rx_unwind_ops::_Capture_restore_end:
{ // restore end of capturing group
auto _Node = static_cast<_Node_capture*>(_Frame._Node);
_Tgt_state._Grps[_Node->_Idx]._End = _Frame._Match_state._Cur;
}
break;

default:
#if _ITERATOR_DEBUG_LEVEL != 0
_STL_REPORT_ERROR("internal stack of regex matcher corrupted");
Expand Down
10 changes: 10 additions & 0 deletions tests/std/tests/VSO_0000000_regex_use/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2350,6 +2350,15 @@ void test_gh_5798() {
}
}

void test_gh_5865() {
    // GH-5865: <regex>: Remove capture extent vectors from stack frames
    // Every case below repeats a group whose body starts with a positive lookahead
    // containing a capturing group, so the matcher has to backtrack over a lookahead
    // that already succeeded and must restore the capturing group correctly.
    struct lookahead_case {
        const char* subject;
        const char* pattern;
        const char* expected; // third argument of should_capture(); presumably the expected capture text
    };

    static const lookahead_case cases[] = {
        {"ab", "(?:(?=(.*))ab)*", "ab"},
        {"abcd", "(?:(?=(.*))ab)*cd", "abcd"},
        {"abab", "(?:(?=(.*))ab)*ab", "abab"},
    };

    // Run the cases in the same order as they are listed above.
    for (const auto& entry : cases) {
        g_regexTester.should_capture(entry.subject, entry.pattern, entry.expected);
    }
}

int main() {
test_dev10_449367_case_insensitivity_should_work();
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
Expand Down Expand Up @@ -2407,6 +2416,7 @@ int main() {
test_gh_5792();
test_gh_5797();
test_gh_5798();
test_gh_5865();

return g_regexTester.result();
}