diff --git a/stl/inc/regex b/stl/inc/regex index 3753efd523d..3c337580e7b 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -327,7 +327,6 @@ inline size_t _Regex_transform_primary(_Out_writes_(_Last1 - _First1) _Post_read template class _Regex_traits : public _Regex_traits_base { // base class for regular expression traits public: - using _Uelem = make_unsigned_t<_Elem>; using char_type = _Elem; using size_type = size_t; using string_type = basic_string<_Elem>; @@ -638,11 +637,9 @@ inline bool _Is_word(char _Ch) { template bool _Is_word(_Elem _Ch) { - // assumes 'x' == L'x' for the ASCII range - using _UElem = make_unsigned_t<_Elem>; - - const auto _UCh = static_cast<_UElem>(_Ch); - return _UCh <= static_cast<_UElem>('z') && _Is_word(static_cast(_UCh)); + // assumes that ASCII characters are represented at ASCII code points + const auto _Uchar = static_cast(_Ch); + return static_cast<_Elem>(_Uchar) == _Ch && _Uchar <= 'z' && _Is_word(_Uchar); } _EXPORT_STD template @@ -3727,11 +3724,23 @@ _BidIt1 _Search_translate_left(_BidIt1 _Begin1, _BidIt1 _End1, _BidIt2 _Begin2, } } -template -bool _Lookup_range(unsigned int _Ch, const _Buf<_Elem>* _Bufptr) { // check whether _Ch is in _Buf - using _Uelem = make_unsigned_t<_Elem>; +template +bool _Lookup_range(const _Elem _Ch, const _Buf<_Elem>* const _Bufptr) { // check whether _Ch is in _Buf for (unsigned int _Ix = 0; _Ix < _Bufptr->_Size(); _Ix += 2) { // check current position - if (static_cast<_Uelem>(_Bufptr->_At(_Ix)) <= _Ch && _Ch <= static_cast<_Uelem>(_Bufptr->_At(_Ix + 1))) { + const _Elem _Left = _Bufptr->_At(_Ix); + const _Elem _Right = _Bufptr->_At(_Ix + 1); + + bool _Within_range; + if constexpr (is_same_v<_Char_traits, char_traits>) { + const auto _Uchar = static_cast(_Ch); + _Within_range = static_cast(_Left) <= _Uchar && _Uchar <= static_cast(_Right); + } else if constexpr (is_same_v<_Char_traits, char_traits>) { + _Within_range = _Left <= _Ch && _Ch <= _Right; + } else { + _Within_range = !_Char_traits::lt(_Ch, _Left) && !_Char_traits::lt(_Right, _Ch); + } + + if (_Within_range) { return true; } } @@ -3829,7 +3838,7 @@ _It _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_class(_Node_base* _Nx, } else if (_Sflags & regex_constants::collate) { _Ch = _Traits.translate(_Ch); } - const auto _UCh = static_cast(_Ch); + const auto _Uchar = static_cast(_Ch); _It _Res0 = _First; ++_Res0; @@ -3841,11 +3850,12 @@ _It _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_class(_Node_base* _Nx, _Res0 = _Resx; _Found = true; } else if (_Node->_Ranges - && (_Sflags & regex_constants::collate ? _STD _Lookup_collating_range(_Ch, _Node->_Ranges, _Traits) - : _STD _Lookup_range(_UCh, _Node->_Ranges))) { + && (_Sflags & regex_constants::collate + ? _STD _Lookup_collating_range(_Ch, _Node->_Ranges, _Traits) + : _STD _Lookup_range(_Ch, _Node->_Ranges))) { _Found = true; - } else if (_UCh < _Bmp_max) { - _Found = _Node->_Small && _Node->_Small->_Find(_UCh); + } else if (static_cast<_Elem>(_Uchar) == _Ch) { + _Found = _Node->_Small && _Node->_Small->_Find(_Uchar); } else if (_Node->_Large && _STD find(_Node->_Large->_Str(), _Node->_Large->_Str() + _Node->_Large->_Size(), _Ch) != _Node->_Large->_Str() + _Node->_Large->_Size()) { @@ -3913,15 +3923,15 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Is_wbound() const { if ((_Mflags & regex_constants::match_prev_avail) || _Tgt_state._Cur != _Begin) { // if --_Cur is valid, check for preceding word character if (_Tgt_state._Cur == _End) { - return (_Mflags & regex_constants::match_not_eow) == 0 && _Is_word(*_Prev_iter(_Tgt_state._Cur)); + return (_Mflags & regex_constants::match_not_eow) == 0 && _STD _Is_word(*_Prev_iter(_Tgt_state._Cur)); } else { - return _Is_word(*_Prev_iter(_Tgt_state._Cur)) != _Is_word(*_Tgt_state._Cur); + return _STD _Is_word(*_Prev_iter(_Tgt_state._Cur)) != _STD _Is_word(*_Tgt_state._Cur); } } else { // --_Cur is not valid if (_Tgt_state._Cur == _End) { return false; } else { - return (_Mflags & regex_constants::match_not_bow) == 0 && _Is_word(*_Tgt_state._Cur); + return (_Mflags & regex_constants::match_not_bow) == 0 && _STD _Is_word(*_Tgt_state._Cur); } } } @@ -3936,7 +3946,15 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al template bool _Is_ecmascript_line_terminator(_Elem _Ch) { - return _Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps; + if constexpr (is_same_v<_Elem, char>) { + return _Ch == _Meta_nl || _Ch == _Meta_cr; + } else if constexpr (is_same_v<_Elem, wchar_t>) { + return _Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps; + } else { + auto _UCh = static_cast(_Ch); + return static_cast<_Elem>(_UCh) == _Ch + && (_UCh == _Meta_nl || _UCh == _Meta_cr || _UCh == _Meta_ls || _UCh == _Meta_ps); + } } template diff --git a/tests/std/tests/GH_000995_regex_custom_char_types/env.lst b/tests/std/tests/GH_000995_regex_custom_char_types/env.lst index f141421b292..19f025bd0e6 100644 --- a/tests/std/tests/GH_000995_regex_custom_char_types/env.lst +++ b/tests/std/tests/GH_000995_regex_custom_char_types/env.lst @@ -1,4 +1,4 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -RUNALL_INCLUDE ..\impure_matrix.lst +RUNALL_INCLUDE ..\usual_matrix.lst diff --git a/tests/std/tests/GH_000995_regex_custom_char_types/test.cpp b/tests/std/tests/GH_000995_regex_custom_char_types/test.cpp index c9409259f64..a057af8f9cb 100644 --- a/tests/std/tests/GH_000995_regex_custom_char_types/test.cpp +++ b/tests/std/tests/GH_000995_regex_custom_char_types/test.cpp @@ -1,24 +1,85 @@ // Copyright (c) Microsoft Corporation. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// TRANSITION, GH-5563 -#pragma warning(push) -#pragma warning(disable : 6510) -#include -#pragma warning(pop) - #include +#include #include +#include #include +#include #include +#include +#include using namespace std; +enum class signed_wchar_enum : short {}; +enum class ullong_enum : unsigned long long {}; + +template +T convert_to(const signed_wchar_enum& char_enum) { + return static_cast(char_enum); +} + +template +T convert_to(const ullong_enum& char_enum) { + return static_cast(char_enum); +} + +template +class wrapped_character { +public: + wrapped_character() = default; + explicit wrapped_character(char ch) : character(static_cast(ch)) {} + explicit wrapped_character(unsigned char ch) : character(ch) {} + explicit wrapped_character(Elem w) : character(w) {} + template , int> = 0> + explicit wrapped_character(wchar_t w) : character(w) {} + explicit wrapped_character(int w) = delete; + explicit wrapped_character(unsigned int w) : character(static_cast(w)) {} + + operator unsigned char() const { + return static_cast(character); + } + + operator unsigned int() const { + return static_cast(character); + } + + operator char() const = delete; + operator wchar_t() const = delete; + operator int() const = delete; + operator unsigned long long() const = delete; + + friend bool operator==(const wrapped_character& lhs, const wrapped_character& rhs) { + return lhs.character == rhs.character; + } + + template + friend T convert_to(const wrapped_character& wrapped_char) { + return static_cast(wrapped_char.character); + } + +private: + Elem character; +}; + +template +T convert_to(const wrapped_character& wrapped_char); + +template +bool operator!=(const wrapped_character& lhs, const wrapped_character& rhs) { + return !(lhs == rhs); +} + +using wrapped_wchar = wrapped_character; +using wrapped_ullong = wrapped_character; + template basic_string convert_to_underlying_string(FwdIt first, FwdIt last) { basic_string str; for (; first != last; ++first) { - str.push_back(static_cast(*first)); + str.push_back(convert_to(*first)); } return str; } @@ -54,7 +115,7 @@ class test_regex_traits { } SourceChar translate_nocase(const SourceChar c) const { - return static_cast(inner.translate_nocase(static_cast(c))); + return static_cast(inner.translate_nocase(convert_to(c))); } template @@ -65,8 +126,10 @@ class test_regex_traits { template string_type transform_primary(FwdIt first, FwdIt last) const { - auto str = convert_to_underlying_string(first, last); - return convert_from_underlying_string(inner.transform_primary(str.begin(), str.end())); + if (distance(first, last) == 1) { + return string_type{first, last}; + } + return string_type{}; } template @@ -82,11 +145,11 @@ class test_regex_traits { } bool isctype(SourceChar c, char_class_type f) const { - return inner.isctype(static_cast(c), f); + return inner.isctype(convert_to(c), f); } int value(SourceChar ch, int radix) const { - return inner.value(static_cast(ch), radix); + return inner.value(convert_to(ch), radix); } locale_type imbue(locale_type l) { @@ -101,52 +164,13 @@ class test_regex_traits { rx_traits inner; }; -enum class signed_wchar_enum : short {}; - -class wrapped_wchar { -public: - wrapped_wchar() = default; - explicit wrapped_wchar(char ch) : character(static_cast(ch)) {} - explicit wrapped_wchar(unsigned char ch) : character(ch) {} - explicit wrapped_wchar(wchar_t w) : character(w) {} - explicit wrapped_wchar(int w) = delete; - explicit wrapped_wchar(unsigned int w) : character(static_cast(w)) {} - - operator unsigned char() const { - return static_cast(character); - } - - operator unsigned int() const { - return static_cast(character); - } - - // to support test_regex_traits - operator wchar_t() const { - return character; - } - - operator char() const = delete; - operator int() const = delete; - - friend bool operator==(const wrapped_wchar& lhs, const wrapped_wchar& rhs) { - return lhs.character == rhs.character; - } - -private: - wchar_t character; -}; - -bool operator!=(const wrapped_wchar& lhs, const wrapped_wchar& rhs) { - return !(lhs == rhs); -} - -template +template struct custom_char_traits { using char_type = Elem; - using int_type = int; + using int_type = conditional_t; using pos_type = streampos; using off_type = streamoff; - using state_type = char_traits::state_type; + using state_type = mbstate_t; static Elem* copy(Elem* const first1, const Elem* const first2, const size_t count) noexcept { copy_n(first2, count, first1); @@ -213,7 +237,7 @@ struct custom_char_traits { } static bool lt(const Elem left, const Elem right) noexcept { - return static_cast(left) < static_cast(right); + return convert_to(left) < convert_to(right); } static Elem to_char_type(const int_type meta) noexcept { @@ -238,13 +262,20 @@ struct custom_char_traits { }; template <> -struct char_traits : custom_char_traits {}; +struct char_traits : custom_char_traits {}; + +template <> +struct char_traits : custom_char_traits {}; + +template <> +struct char_traits : custom_char_traits {}; template <> -struct char_traits : custom_char_traits {}; +struct char_traits : custom_char_traits {}; + void test_gh_5592() { - // GH-5592: Remove _Uelem from the parser + // GH-5592: Remove non-standard _Uelem from parser // This test checks that the parser compiles and doesn't crash // when user-defined character types are used. @@ -263,7 +294,145 @@ void test_gh_5592() { } } +template +void test_regex_for_custom_char(const wstring& pattern, const wstring& input, bool should_match) { + auto converted_pattern = convert_from_underlying_string>(pattern); + auto converted_input = convert_from_underlying_string>(input); + basic_regex re{converted_pattern}; + assert(regex_match(converted_input, re) == should_match); +} + +void test_regex_on_custom_wchars(const wstring& pattern, const wstring& input, bool should_match) { + test_regex_for_custom_char>( + pattern, input, should_match); + test_regex_for_custom_char>(pattern, input, should_match); +} + +template +void test_regex_for_custom_char(const UnderlyingChar* pat_first, const UnderlyingChar* pat_last, + const UnderlyingChar* input_first, const UnderlyingChar* input_last, bool should_match) { + basic_string converted_pattern; + basic_string converted_input; + transform(pat_first, pat_last, back_inserter(converted_pattern), + [](UnderlyingChar val) { return static_cast(val); }); + transform(input_first, input_last, back_inserter(converted_input), + [](UnderlyingChar val) { return static_cast(val); }); + basic_regex re{converted_pattern}; + assert(regex_match(converted_input, re) == should_match); +} + +void test_regex_on_custom_ullongs(const unsigned long long* pat_first, const unsigned long long* pat_last, + const unsigned long long* input_first, const unsigned long long* input_last, bool should_match) { + test_regex_for_custom_char>( + pat_first, pat_last, input_first, input_last, should_match); + test_regex_for_custom_char>( + pat_first, pat_last, input_first, input_last, should_match); +} + +void test_gh_5671_single_character_patterns() { + // test patterns matching single characters + // simple smoke test + test_regex_on_custom_wchars(L"a", L"a", true); + test_regex_on_custom_wchars(L"a", L"b", false); + + // no truncation issues at 0x100 + test_regex_on_custom_wchars(L"\u0100", L"\u0100", true); // U+0100 LATIN CAPITAL LETTER A WITH MACRON + test_regex_on_custom_wchars(wstring(1, L'\0'), L"\u0100", false); // U+0100 LATIN CAPITAL LETTER A WITH MACRON + test_regex_on_custom_wchars(L"\u0100", wstring(1, L'\0'), false); // U+0100 LATIN CAPITAL LETTER A WITH MACRON + + // no issues with signed character values + test_regex_on_custom_wchars( + L"\uf000", L"\uf000", true); // U+F000 in private use area (signed for signed_wchar_enum) + + // matching for very large values + { + const unsigned long long pattern = 0xabababababULL; + + test_regex_on_custom_ullongs(&pattern, &pattern + 1, &pattern, &pattern + 1, true); + + for (const unsigned long long unmatched : {0xabULL, 0xababULL, 0xababababULL, 0xababababaaULL, 0xababababacULL, + 0x9bababababULL, 0xbbababababULL, 0xffffffababababULL}) { + test_regex_on_custom_ullongs(&pattern, &pattern + 1, &unmatched, &unmatched + 1, false); + } + } +} + +void test_gh_5671_line_terminators() { + // test line terminator handling for dot + test_regex_on_custom_wchars(L".", L"a", true); + test_regex_on_custom_wchars(L".", L"\r", false); + test_regex_on_custom_wchars(L".", L"\n", false); + test_regex_on_custom_wchars(L".", L"\u2028", false); // U+2028 LINE SEPARATOR + test_regex_on_custom_wchars(L".", L"\u2029", false); // U+2029 PARAGRAPH SEPARATOR + test_regex_on_custom_wchars(L".", L"\u2c60", true); // U+2C60 LATIN CAPITAL LETTER L WITH DOUBLE BAR + + { + const unsigned long long dot = L'.'; + + for (const unsigned long long line_terminator : {L'\r', L'\n', L'\u2028', L'\u2029'}) { + test_regex_on_custom_ullongs(&dot, &dot + 1, &line_terminator, &line_terminator + 1, false); + + const unsigned long long shifted_line_terminator = 0x100000000ULL + line_terminator; + test_regex_on_custom_ullongs(&dot, &dot + 1, &shifted_line_terminator, &shifted_line_terminator + 1, true); + } + } +} + +void test_gh_5671_word_boundaries() { + // test word boundaries + test_regex_on_custom_wchars(LR"(a\b.)", L"a\u0141", true); // U+0141 LATIN CAPITAL LETTER L WITH STROKE + test_regex_on_custom_wchars(LR"(a\B.)", L"a\u0141", false); // U+0141 LATIN CAPITAL LETTER L WITH STROKE +} + +void test_gh_5671_character_ranges() { + // test simple ranges + test_regex_on_custom_wchars(LR"([b-\u0141])", L"a", false); // U+0141 LATIN CAPITAL LETTER L WITH STROKE + test_regex_on_custom_wchars(LR"([b-\u0141])", L"b", true); + test_regex_on_custom_wchars(LR"([b-\u0141])", L"\u0141", true); + test_regex_on_custom_wchars(LR"([b-\u0141])", L"\u0142", false); // U+0142 LATIN SMALL LETTER L WITH STROKE + + // test that GH-5437 fix for small ranges near U+0100 remains in place for custom types + test_regex_on_custom_wchars(LR"([\u00ff-\u0100])", L"\u00FE", false); // U+00FE LATIN SMALL LETTER THORN + test_regex_on_custom_wchars(LR"([\u00ff-\u0100])", L"\u00FF", true); // U+00FF LATIN SMALL LETTER Y WITH DIAERESIS + test_regex_on_custom_wchars(LR"([\u00ff-\u0100])", L"\u0100", true); // U+0100 LATIN CAPITAL LETTER A WITH MACRON + test_regex_on_custom_wchars(LR"([\u00ff-\u0100])", L"\u0101", false); // U+0101 LATIN SMALL LETTER A WITH MACRON + + // test ranges with negative upper boundary when signed + test_regex_on_custom_wchars( + LR"([\u7fed-\u8123])", L"\u7fed", true); // U+7FED CJK UNIFIED IDEOGRAPH-7FED, U+8123 CJK UNIFIED IDEOGRAPH-8123 + test_regex_on_custom_wchars(LR"([\u7fed-\u8123])", L"\u8123", true); + test_regex_on_custom_wchars(LR"([\u7fed-\u8123])", L"\u8001", true); // U+8001 CJK UNIFIED IDEOGRAPH-8001 + test_regex_on_custom_wchars(LR"([\u7fed-\u8123])", L"\u7fec", false); // U+7FEC CJK UNIFIED IDEOGRAPH-7FEC + test_regex_on_custom_wchars(LR"([\u7fed-\u8123])", L"\u8124", false); // U+8124 CJK UNIFIED IDEOGRAPH-8124 + + // test ranges with boundaries exceeding UINT_MAX for small and large ranges + for (unsigned long long upper_bound : {0x100000006ULL, 0x100000036ULL}) { + const unsigned long long pattern[] = {L'[', 0x100000004ULL, L'-', upper_bound, L']'}; + for (unsigned long long matched = 0x100000004ULL; matched <= upper_bound; ++matched) { + test_regex_on_custom_ullongs(begin(pattern), end(pattern), &matched, &matched + 1, true); + } + + for (const unsigned long long unmatched : + {0x00000004ULL, upper_bound & 0xffffffffULL, 0x100000003ULL, upper_bound + 1ULL}) { + test_regex_on_custom_ullongs(begin(pattern), end(pattern), &unmatched, &unmatched + 1, false); + } + } +} + +void test_gh_5671() { + // GH-5671: Remove non-standard _Uelem from matcher + // + // The following tests check that internal narrowing conversions to integers in parser and matcher + // as well as signedness of the character type are handled appropriately + // even when using (weird) custom character types. + test_gh_5671_single_character_patterns(); + test_gh_5671_line_terminators(); + test_gh_5671_word_boundaries(); + test_gh_5671_character_ranges(); +} + int main() { test_gh_5592(); + test_gh_5671(); return 0; } diff --git a/tests/std/tests/GH_005244_regex_escape_sequences/test.cpp b/tests/std/tests/GH_005244_regex_escape_sequences/test.cpp index 7e66ca640a5..ba4b87d6d5f 100644 --- a/tests/std/tests/GH_005244_regex_escape_sequences/test.cpp +++ b/tests/std/tests/GH_005244_regex_escape_sequences/test.cpp @@ -24,9 +24,6 @@ class test_regex_traits { using locale_type = typename rx_traits::locale_type; using char_class_type = typename rx_traits::char_class_type; - // TRANSITION, GH-995 - using _Uelem = typename rx_traits::_Uelem; - test_regex_traits() = default; static size_t length(const charT* p) { diff --git a/tests/std/tests/GH_005553_regex_character_translation/test.cpp b/tests/std/tests/GH_005553_regex_character_translation/test.cpp index 33c5521ad49..5eadb05693e 100644 --- a/tests/std/tests/GH_005553_regex_character_translation/test.cpp +++ b/tests/std/tests/GH_005553_regex_character_translation/test.cpp @@ -26,9 +26,6 @@ class nonidempotent_translate_regex_traits : private regex_traits { using char_class_type = typename rx_traits::char_class_type; using uchar_type = make_unsigned_t; - // TRANSITION, GH-995 - using _Uelem = typename rx_traits::_Uelem; - nonidempotent_translate_regex_traits() = default; using rx_traits::length;