Patch summary (free text before the first "diff --git" header; patch tools skip it):

  * _Builder::_Add_range2 is renamed to _Add_range3 and now performs the
    empty-range check itself (raising regex_constants::error_range), so the
    equivalent check is removed from _Parser::_ClassRanges.
  * With regex_constants::collate set, _Add_range3 marks the bitmap entry of
    every code unit below _Bmp_max whose _Traits.transform() key lies between
    the keys of the two range bounds; the range pair is still stored as well.
  * Without collate, bits are marked while _Ex0 < _Bmp_max (the old loop tested
    _Ex1 against _Get_bmax()), which is what the added GH-5437 tests exercise
    for ranges that cross U+00FF/U+0100.
  * _Get_bmax()/_Get_tmax() are removed; _Bmax/_Tmax stay as data members and
    the code uses _Bmp_max/_ARRAY_THRESHOLD instead.

NOTE(review): this text was recovered from a copy in which line breaks were
collapsed and "<...>" spans beginning with a letter were stripped. The
template parameter lists and static_cast target types below are restored;
"typename _RxTraits::_Uelem" and the leading whitespace of context/removed
lines are presumed -- confirm against stl/inc/regex before applying.

diff --git a/stl/inc/regex b/stl/inc/regex
index b2023ebd87c..e5bd9bafeb8 100644
--- a/stl/inc/regex
+++ b/stl/inc/regex
@@ -1537,7 +1537,7 @@ public:
     void _Add_char2(_Elem _Ch);
     void _Add_class();
     void _Add_char_to_class(_Elem _Ch);
-    void _Add_range2(_Elem, _Elem);
+    void _Add_range3(_Elem, _Elem);
     void _Add_named_class(typename _RxTraits::char_class_type, _Rx_char_class_kind);
     void _Add_equiv2(const _Elem*, const _Elem*);
     void _Add_coll2(const _Elem*, const _Elem*);
@@ -1567,11 +1567,8 @@ private:
     _Node_base* _Current;
     regex_constants::syntax_option_type _Flags;
     const _RxTraits& _Traits;
-    const int _Bmax; // Do not use; use _Get_bmax instead.
-    const int _Tmax; // Do not use; use _Get_tmax instead.
-
-    unsigned int _Get_bmax() const;
-    unsigned int _Get_tmax() const;
+    const int _Bmax; // TRANSITION, ABI: preserved for binary compatibility
+    const int _Tmax; // TRANSITION, ABI: preserved for binary compatibility
 
 public:
     _Builder& operator=(const _Builder&) = delete;
@@ -2911,33 +2908,61 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_char_to_class(_Elem _Ch) { // add
 }
 
 template <class _FwdIt, class _Elem, class _RxTraits>
-void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range2(const _Elem _Arg0, const _Elem _Arg1) {
+void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range3(const _Elem _Arg0, const _Elem _Arg1) {
     // add character range to set
+    using string_type = typename _RxTraits::string_type;
     unsigned int _Ex0       = static_cast<typename _RxTraits::_Uelem>(_Arg0);
     const unsigned int _Ex1 = static_cast<typename _RxTraits::_Uelem>(_Arg1);
     _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
 
-    for (; _Ex0 <= _Ex1 && _Ex1 < _Get_bmax(); ++_Ex0) { // set a bit
-        if (!_Node->_Small) {
+    // set bits and check that the range is non-empty
+    if (_Flags & regex_constants::collate) {
+        _Elem _Ch;
+        const auto _Ch_ptr          = _STD addressof(_Ch);
+        const auto _Arg0_ptr        = _STD addressof(_Arg0);
+        const auto _Arg1_ptr        = _STD addressof(_Arg1);
+        const string_type _Arg0_key = _Traits.transform(_Arg0_ptr, _Arg0_ptr + 1);
+        const string_type _Arg1_key = _Traits.transform(_Arg1_ptr, _Arg1_ptr + 1);
+
+        if (_Arg0_key > _Arg1_key) {
+            _Xregex_error(regex_constants::error_range);
+        }
+
+        for (unsigned int _UCh = 0; _UCh < _Bmp_max; ++_UCh) {
+            _Ch                       = static_cast<_Elem>(_UCh);
+            const string_type _Ch_key = _Traits.transform(_Ch_ptr, _Ch_ptr + 1);
+            if (_Arg0_key <= _Ch_key && _Ch_key <= _Arg1_key) {
+                if (!_Node->_Small) {
+                    _Node->_Small = new _Bitmap;
+                }
+                _Node->_Small->_Mark(_UCh);
+            }
+        }
+    } else if (_Ex0 > _Ex1) {
+        _Xregex_error(regex_constants::error_range);
+    } else {
+        if (!_Node->_Small && _Ex0 < _Bmp_max) {
             _Node->_Small = new _Bitmap;
         }
 
-        _Node->_Small->_Mark(_Ex0);
-    }
+        for (; _Ex0 <= _Ex1 && _Ex0 < _Bmp_max; ++_Ex0) {
+            _Node->_Small->_Mark(_Ex0);
+        }
 
-    if ((_Flags & regex_constants::collate) || _Ex1 >= _Ex0) {
-        if (_Ex1 - _Ex0 < _Get_tmax()) {
+        if (_Ex1 - _Ex0 < _ARRAY_THRESHOLD) {
             for (; _Ex0 <= _Ex1; ++_Ex0) {
                 _Add_char_to_array(static_cast<_Elem>(_Ex0));
             }
-        } else { // store remaining range as pair
-            if (!_Node->_Ranges) {
-                _Node->_Ranges = new _Buf<_Elem>;
-            }
+        }
+    }
 
-            _Node->_Ranges->_Insert2(static_cast<_Elem>(_Ex0));
-            _Node->_Ranges->_Insert2(_Arg1);
+    if ((_Flags & regex_constants::collate) || _Ex1 >= _Ex0) { // store remaining range as pair
+        if (!_Node->_Ranges) {
+            _Node->_Ranges = new _Buf<_Elem>;
         }
+
+        _Node->_Ranges->_Insert2(static_cast<_Elem>(_Ex0));
+        _Node->_Ranges->_Insert2(_Arg1);
     }
 }
 
@@ -2991,16 +3016,6 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Char_to_elts2(const _Elem* const _Firs
     (*_Cur)->_Data._Insert2(_First, _Last);
 }
 
-template <class _FwdIt, class _Elem, class _RxTraits>
-unsigned int _Builder<_FwdIt, _Elem, _RxTraits>::_Get_bmax() const {
-    return static_cast<unsigned int>(_Bmax);
-}
-
-template <class _FwdIt, class _Elem, class _RxTraits>
-unsigned int _Builder<_FwdIt, _Elem, _RxTraits>::_Get_tmax() const {
-    return static_cast<unsigned int>(_Tmax);
-}
-
 template <class _FwdIt, class _Elem, class _RxTraits>
 void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_equiv2(const _Elem* const _First, const _Elem* const _Last) {
     // add elements of equivalence class to bracket expression
@@ -4328,18 +4343,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_ClassRanges() { // check for valid clas
                 _Chr2 = _Traits.translate(_Chr2);
             }
 
-            if (_Flags & regex_constants::collate) {
-                const _Elem* const _Chr1_ptr = _STD addressof(_Chr1);
-                const _Elem* const _Chr2_ptr = _STD addressof(_Chr2);
-                if (_Traits.transform(_Chr2_ptr, _Chr2_ptr + 1) < _Traits.transform(_Chr1_ptr, _Chr1_ptr + 1)) {
-                    _Error(regex_constants::error_range);
-                }
-            } else if (static_cast<typename _RxTraits::_Uelem>(_Chr2)
-                       < static_cast<typename _RxTraits::_Uelem>(_Chr1)) {
-                _Error(regex_constants::error_range);
-            }
-
-            _Nfa._Add_range2(_Chr1, _Chr2);
+            _Nfa._Add_range3(_Chr1, _Chr2);
         } else if (_Ret == _Prs_chr) {
             _Nfa._Add_char_to_class(static_cast<_Elem>(_Val));
         }
diff --git a/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp b/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp
index c7c017c9778..b24141a9fc3 100644
--- a/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp
+++ b/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp
@@ -562,9 +562,37 @@ void test_gh_994() {
 #endif // !defined(SKIP_COLLATE_TESTS)
 }
 
+void test_gh_5437_ECMAScript_or_collate(syntax_option_type ECMAScript_or_collate) {
+    {
+        test_wregex char_range(&g_regexTester, L"^[\u0001-\u0200]$", ECMAScript_or_collate);
+        for (wchar_t ch = L'\u0001'; ch <= L'\u0200'; ++ch) {
+            char_range.should_search_match(wstring(1, ch), wstring(1, ch));
+        }
+        char_range.should_search_fail(wstring(1, L'\u0000'));
+        char_range.should_search_fail(wstring(1, L'\u0201'));
+    }
+    {
+        test_wregex char_range(&g_regexTester, L"^[\u00FE-\u0100]$", ECMAScript_or_collate);
+        for (wchar_t ch = L'\u00FE'; ch <= L'\u0100'; ++ch) {
+            char_range.should_search_match(wstring(1, ch), wstring(1, ch));
+        }
+        char_range.should_search_fail(wstring(1, L'\u00FD'));
+        char_range.should_search_fail(wstring(1, L'\u0101'));
+    }
+}
+
+void test_gh_5437() {
+    // GH-5437: make `wregex` handle small character ranges containing U+00FF and U+0100 correctly
+    test_gh_5437_ECMAScript_or_collate(ECMAScript);
+#ifndef SKIP_COLLATE_TESTS
+    test_gh_5437_ECMAScript_or_collate(regex_constants::collate);
+#endif // !defined(SKIP_COLLATE_TESTS)
+}
+
 int main() {
     test_collating_ranges_german();
     test_gh_994();
+    test_gh_5437();
 
     return g_regexTester.result();
 }