From b3c1f882f0287e049ac3a83de2decd8c9b959c85 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Fri, 14 Apr 2023 06:34:41 +0200 Subject: [PATCH 1/5] Move `from_utf8(state, byte)` into .cpp file Signed-off-by: Christian Parpart --- src/libunicode/CMakeLists.txt | 1 + src/libunicode/utf8.cpp | 66 +++++++++++++++++++++++++++++++++++ src/libunicode/utf8.h | 47 +------------------------ 3 files changed, 68 insertions(+), 46 deletions(-) create mode 100644 src/libunicode/utf8.cpp diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index da490c7..2b9ed3f 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -89,6 +89,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} grapheme_segmenter.cpp scan.cpp script_segmenter.cpp + utf8.cpp width.cpp # auto-generated by unicode_tablgen diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp new file mode 100644 index 0000000..5cadd96 --- /dev/null +++ b/src/libunicode/utf8.cpp @@ -0,0 +1,66 @@ +/** + * This file is part of the "libunicode" project + * Copyright (c) 2020 Christian Parpart + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +namespace unicode +{ + +ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte) +{ + if (!_state.expectedLength) + { + if ((_byte & 0b1000'0000) == 0) + { + _state.currentLength = 1; + return Success { _byte }; + } + else if ((_byte & 0b1110'0000) == 0b1100'0000) + { + _state.currentLength = 1; + _state.expectedLength = 2; + _state.character = _byte & 0b0001'1111; + } + else if ((_byte & 0b1111'0000) == 0b1110'0000) + { + _state.currentLength = 1; + _state.expectedLength = 3; + _state.character = _byte & 0b0000'1111; + } + else if ((_byte & 0b1111'1000) == 0b1111'0000) + { + _state.currentLength = 1; + _state.expectedLength = 4; + _state.character = _byte & 0b0000'0111; + } + else + { + _state.currentLength = 1; + return Invalid {}; + } + } + else + { + _state.character <<= 6; + _state.character |= _byte & 0b0011'1111; + _state.currentLength++; + } + + if (_state.currentLength < _state.expectedLength) + return { Incomplete {} }; + + _state.expectedLength = 0; // reset state + return { Success { _state.character } }; +} + +} // namespace unicode diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h index c65743d..16898e7 100644 --- a/src/libunicode/utf8.h +++ b/src/libunicode/utf8.h @@ -109,52 +109,7 @@ struct Success using ConvertResult = std::variant; /// Progressively decodes a UTF-8 codepoint. 
-inline ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte) -{ - if (!_state.expectedLength) - { - if ((_byte & 0b1000'0000) == 0) - { - _state.currentLength = 1; - return Success { _byte }; - } - else if ((_byte & 0b1110'0000) == 0b1100'0000) - { - _state.currentLength = 1; - _state.expectedLength = 2; - _state.character = _byte & 0b0001'1111; - } - else if ((_byte & 0b1111'0000) == 0b1110'0000) - { - _state.currentLength = 1; - _state.expectedLength = 3; - _state.character = _byte & 0b0000'1111; - } - else if ((_byte & 0b1111'1000) == 0b1111'0000) - { - _state.currentLength = 1; - _state.expectedLength = 4; - _state.character = _byte & 0b0000'0111; - } - else - { - _state.currentLength = 1; - return Invalid {}; - } - } - else - { - _state.character <<= 6; - _state.character |= _byte & 0b0011'1111; - _state.currentLength++; - } - - if (_state.currentLength < _state.expectedLength) - return { Incomplete {} }; - - _state.expectedLength = 0; // reset state - return { Success { _state.character } }; -} +ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte); inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte) { From bae06ae4efbe38e7d03ffa9b8202e656d63e7f66 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Fri, 14 Apr 2023 06:40:17 +0200 Subject: [PATCH 2/5] Apply some internal variable renames in `from_utf8(state, byte)` Signed-off-by: Christian Parpart --- src/libunicode/utf8.cpp | 48 ++++++++++++++++++++--------------------- src/libunicode/utf8.h | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp index 5cadd96..fe37740 100644 --- a/src/libunicode/utf8.cpp +++ b/src/libunicode/utf8.cpp @@ -16,51 +16,51 @@ namespace unicode { -ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte) +ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) { - if (!_state.expectedLength) + if (!state.expectedLength) { - if ((_byte & 0b1000'0000) == 
0) + if ((value & 0b1000'0000) == 0) { - _state.currentLength = 1; - return Success { _byte }; + state.currentLength = 1; + return Success { value }; } - else if ((_byte & 0b1110'0000) == 0b1100'0000) + else if ((value & 0b1110'0000) == 0b1100'0000) { - _state.currentLength = 1; - _state.expectedLength = 2; - _state.character = _byte & 0b0001'1111; + state.currentLength = 1; + state.expectedLength = 2; + state.character = value & 0b0001'1111; } - else if ((_byte & 0b1111'0000) == 0b1110'0000) + else if ((value & 0b1111'0000) == 0b1110'0000) { - _state.currentLength = 1; - _state.expectedLength = 3; - _state.character = _byte & 0b0000'1111; + state.currentLength = 1; + state.expectedLength = 3; + state.character = value & 0b0000'1111; } - else if ((_byte & 0b1111'1000) == 0b1111'0000) + else if ((value & 0b1111'1000) == 0b1111'0000) { - _state.currentLength = 1; - _state.expectedLength = 4; - _state.character = _byte & 0b0000'0111; + state.currentLength = 1; + state.expectedLength = 4; + state.character = value & 0b0000'0111; } else { - _state.currentLength = 1; + state.currentLength = 1; return Invalid {}; } } else { - _state.character <<= 6; - _state.character |= _byte & 0b0011'1111; - _state.currentLength++; + state.character <<= 6; + state.character |= value & 0b0011'1111; + state.currentLength++; } - if (_state.currentLength < _state.expectedLength) + if (state.currentLength < state.expectedLength) return { Incomplete {} }; - _state.expectedLength = 0; // reset state - return { Success { _state.character } }; + state.expectedLength = 0; // reset state + return { Success { state.character } }; } } // namespace unicode diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h index 16898e7..d09ac3c 100644 --- a/src/libunicode/utf8.h +++ b/src/libunicode/utf8.h @@ -109,7 +109,7 @@ struct Success using ConvertResult = std::variant; /// Progressively decodes a UTF-8 codepoint. 
-ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte); +ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value); inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte) { From 644028c1e19f5df66215fba8a56555769b95ffc5 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Fri, 14 Apr 2023 06:41:12 +0200 Subject: [PATCH 3/5] Mark `from_utf8(state, byte)` as `noexcept`. Signed-off-by: Christian Parpart --- src/libunicode/utf8.cpp | 2 +- src/libunicode/utf8.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp index fe37740..40f81ef 100644 --- a/src/libunicode/utf8.cpp +++ b/src/libunicode/utf8.cpp @@ -16,7 +16,7 @@ namespace unicode { -ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) +ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept { if (!state.expectedLength) { diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h index d09ac3c..8a1662f 100644 --- a/src/libunicode/utf8.h +++ b/src/libunicode/utf8.h @@ -109,7 +109,7 @@ struct Success using ConvertResult = std::variant; /// Progressively decodes a UTF-8 codepoint. 
-ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value); +ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept; inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte) { From c0433805c3f6438383200cf31b87c9c041437e3b Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Fri, 14 Apr 2023 06:44:00 +0200 Subject: [PATCH 4/5] Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid` Signed-off-by: Christian Parpart --- Changelog.md | 2 +- src/libunicode/utf8.cpp | 14 +++++++ src/libunicode/utf8_test.cpp | 71 ++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/Changelog.md b/Changelog.md index 60cda3a..be62fb8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,6 +1,6 @@ ## 0.3.1 (unreleased) -- ... +- Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`. ## 0.3.0 (2023-03-01) diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp index 40f81ef..082e607 100644 --- a/src/libunicode/utf8.cpp +++ b/src/libunicode/utf8.cpp @@ -46,9 +46,23 @@ ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept else { state.currentLength = 1; + state.expectedLength = 0; return Invalid {}; } } + // clang-format off + else if ((value & 0b1110'0000) == 0b1100'0000 + || (value & 0b1111'0000) == 0b1110'0000 + || (value & 0b1111'1000) == 0b1111'0000) + // clang-format on + { + // We have a new codepoint, but the previous one was incomplete. + state.expectedLength = 0; + // Return Invalid for the current incomplete codepoint, + // but have already started the next codepoint. 
+ from_utf8(state, value); + return { Invalid {} }; + } else { state.character <<= 6; diff --git a/src/libunicode/utf8_test.cpp b/src/libunicode/utf8_test.cpp index c4a3715..ec3ff0b 100644 --- a/src/libunicode/utf8_test.cpp +++ b/src/libunicode/utf8_test.cpp @@ -146,6 +146,77 @@ TEST_CASE("utf8.from_utf8.invalid", "[utf8]") CHECK(a32 == U"HiHo"); } +TEST_CASE("utf8.from_utf8.incomplete.2", "[utf8]") +{ + // Ensure incomplete bytes are consumed and reported as Invalid accordingly. + auto state = utf8_decoder_state {}; + + // We start with an incomplete 2-byte sequence. + auto const r0 = from_utf8(state, 0xC7); + REQUIRE(holds_alternative(r0)); + + // Continue with another 2-byte sequence, + // while the first one is still incomplete. + auto const r1 = from_utf8(state, 0xC7); + REQUIRE(holds_alternative(r1)); + auto const r2 = from_utf8(state, 0x8E); + REQUIRE(holds_alternative(r2)); + REQUIRE((unsigned) get(r2).value == 0x01CE); +} + +TEST_CASE("utf8.from_utf8.incomplete.3", "[utf8]") +{ + // Ensure incomplete bytes are consumed and reported as Invalid accordingly. + auto state = utf8_decoder_state {}; + + // We start with an incomplete 3-byte sequence. + auto const r0 = from_utf8(state, 0xE2); + REQUIRE(holds_alternative(r0)); + auto const r1 = from_utf8(state, 0x82); + REQUIRE(holds_alternative(r1)); + + // Continue with another 3-byte sequence, + // while the first one is still incomplete. + auto const r2 = from_utf8(state, 0xE2); + REQUIRE(holds_alternative(r2)); + auto const r3 = from_utf8(state, 0x82); + REQUIRE(holds_alternative(r3)); + auto const r4 = from_utf8(state, 0xAC); + REQUIRE(holds_alternative(r4)); + REQUIRE((unsigned) get(r4).value == 0x20AC); +} + +TEST_CASE("utf8.from_utf8.incomplete.4", "[utf8]") +{ + auto constexpr sequence = "\xF0\x9F\x8D\xA3"sv; + auto constexpr codepoint = 0x1F363; + + auto state = utf8_decoder_state {}; + + // Generate an incomplete multi-byte sequence.
+ for (size_t i = 0; i < sequence.size() - 1; ++i) + { + CAPTURE(i, unsigned(sequence[i])); + auto const r = from_utf8(state, (uint8_t) sequence[i]); + REQUIRE(holds_alternative(r)); + } + + // Now feed the multi-byte sequence again, this time completely. + auto const r0 = from_utf8(state, (uint8_t) sequence[0]); + REQUIRE(holds_alternative(r0)); + + for (size_t i = 1; i < sequence.size() - 1; ++i) + { + CAPTURE(i, unsigned(sequence[i])); + auto const ri = from_utf8(state, (uint8_t) sequence[i]); + REQUIRE(holds_alternative(ri)); + } + + auto const last = from_utf8(state, (uint8_t) sequence.back()); + REQUIRE(holds_alternative(last)); + REQUIRE(get(last).value == codepoint); +} + TEST_CASE("utf8.iter", "[utf8]") { auto constexpr values = string_view { From a0c98926f58473b55746f0814ce82350283c4b74 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Fri, 14 Apr 2023 06:44:43 +0200 Subject: [PATCH 5/5] Change signature of `inline from_utf8(string_view const&)` slightly by dropping its cref Signed-off-by: Christian Parpart --- Changelog.md | 1 + src/libunicode/utf8.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Changelog.md b/Changelog.md index be62fb8..68ee64e 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,6 +1,7 @@ ## 0.3.1 (unreleased) - Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`. +- Change signature of `inline from_utf8(string_view const&)` slightly by dropping its cref. ## 0.3.0 (2023-03-01) diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h index 8a1662f..a49aa5b 100644 --- a/src/libunicode/utf8.h +++ b/src/libunicode/utf8.h @@ -152,7 +152,7 @@ inline ConvertResult from_utf8(char const* _bytes, size_t* _size) } template -inline std::basic_string from_utf8(std::string_view const& _bytes) +inline std::basic_string from_utf8(std::string_view _bytes) { static_assert(sizeof(T) == 4); std::basic_string s;