Merge pull request #71 from contour-terminal/fix/utf8-decode

Fix error handling of UTF-8 decoding when incomplete UTF-8 sequences are processed
contour-terminal · Apr 14, 2023 · ad9398a · ad9398a
2 parents 65e0c6d + a0c9892
commit ad9398a
Show file tree

Hide file tree

Showing 5 changed files with 156 additions and 48 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -1,6 +1,7 @@
 ## 0.3.1 (unreleased)
 
-- ...
+- Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`.
+- Change signature of `inline from_utf8(string_view const&)` slightly: the `string_view` is now taken by value instead of by const reference.
 
 ## 0.3.0 (2023-03-01)
 

diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt
@@ -89,6 +89,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
     grapheme_segmenter.cpp
     scan.cpp
     script_segmenter.cpp
+    utf8.cpp
     width.cpp
 
     # auto-generated by unicode_tablgen

diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp
@@ -0,0 +1,80 @@
+/**
+ * This file is part of the "libunicode" project
+ *   Copyright (c) 2020 Christian Parpart <[email protected]>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <libunicode/utf8.h>
+
+namespace unicode
+{
+
+ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept // Feeds one byte into the decoder state; yields Success, Incomplete, or Invalid.
+{
+    if (!state.expectedLength) // No sequence in progress: value is interpreted as the first byte of a codepoint.
+    {
+        if ((value & 0b1000'0000) == 0) // 0xxxxxxx: single-byte codepoint, returned immediately.
+        {
+            state.currentLength = 1;
+            return Success { value };
+        }
+        else if ((value & 0b1110'0000) == 0b1100'0000) // 110xxxxx: lead byte of a 2-byte sequence.
+        {
+            state.currentLength = 1;
+            state.expectedLength = 2;
+            state.character = value & 0b0001'1111; // Keep the low 5 payload bits.
+        }
+        else if ((value & 0b1111'0000) == 0b1110'0000) // 1110xxxx: lead byte of a 3-byte sequence.
+        {
+            state.currentLength = 1;
+            state.expectedLength = 3;
+            state.character = value & 0b0000'1111; // Keep the low 4 payload bits.
+        }
+        else if ((value & 0b1111'1000) == 0b1111'0000) // 11110xxx: lead byte of a 4-byte sequence.
+        {
+            state.currentLength = 1;
+            state.expectedLength = 4;
+            state.character = value & 0b0000'0111; // Keep the low 3 payload bits.
+        }
+        else // Matches none of the patterns above (e.g. 10xxxxxx or 11111xxx): cannot start a sequence.
+        {
+            state.currentLength = 1;
+            state.expectedLength = 0; // Stays 0, so the next byte is again treated as a first byte.
+            return Invalid {};
+        }
+    }
+    // clang-format off
+    else if ((value & 0b1110'0000) == 0b1100'0000
+          || (value & 0b1111'0000) == 0b1110'0000
+          || (value & 0b1111'1000) == 0b1111'0000)
+    // clang-format on
+    {
+        // A lead byte arrived while the previous multi-byte sequence was still incomplete.
+        state.expectedLength = 0;
+        // Report Invalid for the abandoned sequence. The call below re-enters with
+        // expectedLength == 0, so the lead byte re-initializes state for the next codepoint.
+        from_utf8(state, value); // Return value is discarded; for a lead byte it is Incomplete.
+        return { Invalid {} };
+    }
+    else // NOTE(review): every other byte is folded in as a continuation byte; the 10xxxxxx pattern is not checked here.
+    {
+        state.character <<= 6; // Make room for the next 6 payload bits.
+        state.character |= value & 0b0011'1111;
+        state.currentLength++;
+    }
+
+    if (state.currentLength < state.expectedLength) // More bytes are still required for this codepoint.
+        return { Incomplete {} };
+
+    state.expectedLength = 0; // reset state so the next byte starts a new codepoint
+    return { Success { state.character } }; // NOTE(review): no overlong-encoding or range validation is performed in this function.
+}
+
+} // namespace unicode
diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h
@@ -109,52 +109,7 @@ struct Success
 using ConvertResult = std::variant<Invalid, Incomplete, Success>;
 
 /// Progressively decodes a UTF-8 codepoint.
-inline ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte)
-{
-    if (!_state.expectedLength)
-    {
-        if ((_byte & 0b1000'0000) == 0)
-        {
-            _state.currentLength = 1;
-            return Success { _byte };
-        }
-        else if ((_byte & 0b1110'0000) == 0b1100'0000)
-        {
-            _state.currentLength = 1;
-            _state.expectedLength = 2;
-            _state.character = _byte & 0b0001'1111;
-        }
-        else if ((_byte & 0b1111'0000) == 0b1110'0000)
-        {
-            _state.currentLength = 1;
-            _state.expectedLength = 3;
-            _state.character = _byte & 0b0000'1111;
-        }
-        else if ((_byte & 0b1111'1000) == 0b1111'0000)
-        {
-            _state.currentLength = 1;
-            _state.expectedLength = 4;
-            _state.character = _byte & 0b0000'0111;
-        }
-        else
-        {
-            _state.currentLength = 1;
-            return Invalid {};
-        }
-    }
-    else
-    {
-        _state.character <<= 6;
-        _state.character |= _byte & 0b0011'1111;
-        _state.currentLength++;
-    }
-
-    if (_state.currentLength < _state.expectedLength)
-        return { Incomplete {} };
-
-    _state.expectedLength = 0; // reset state
-    return { Success { _state.character } };
-}
+ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept;
 
 inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte)
 {
@@ -197,7 +152,7 @@ inline ConvertResult from_utf8(char const* _bytes, size_t* _size)
 }
 
 template <typename T = char32_t>
-inline std::basic_string<T> from_utf8(std::string_view const& _bytes)
+inline std::basic_string<T> from_utf8(std::string_view _bytes)
 {
     static_assert(sizeof(T) == 4);
     std::basic_string<T> s;

diff --git a/src/libunicode/utf8_test.cpp b/src/libunicode/utf8_test.cpp
@@ -146,6 +146,77 @@ TEST_CASE("utf8.from_utf8.invalid", "[utf8]")
     CHECK(a32 == U"HiHo");
 }
 
+TEST_CASE("utf8.from_utf8.incomplete.2", "[utf8]")
+{
+    // Ensure an abandoned 2-byte sequence is reported as Invalid and the decoder recovers.
+    auto state = utf8_decoder_state {};
+
+    // We start with an incomplete 2-byte sequence (0xC7 is 0b1100'0111, a 2-byte lead byte).
+    auto const r0 = from_utf8(state, 0xC7);
+    REQUIRE(holds_alternative<Incomplete>(r0));
+
+    // Continue with another 2-byte sequence,
+    // while the first one is still incomplete.
+    auto const r1 = from_utf8(state, 0xC7); // Invalid refers to the first, abandoned lead byte.
+    REQUIRE(holds_alternative<Invalid>(r1));
+    auto const r2 = from_utf8(state, 0x8E); // Completes the second sequence.
+    REQUIRE(holds_alternative<Success>(r2));
+    REQUIRE((unsigned) get<Success>(r2).value == 0x01CE); // 0xC7 0x8E decodes to U+01CE.
+}
+
+TEST_CASE("utf8.from_utf8.incomplete.3", "[utf8]")
+{
+    // Ensure an abandoned 3-byte sequence is reported as Invalid and the decoder recovers.
+    auto state = utf8_decoder_state {};
+
+    // We start with an incomplete 3-byte sequence (only 2 of its 3 bytes are fed).
+    auto const r0 = from_utf8(state, 0xE2);
+    REQUIRE(holds_alternative<Incomplete>(r0));
+    auto const r1 = from_utf8(state, 0x82);
+    REQUIRE(holds_alternative<Incomplete>(r1));
+
+    // Continue with another 3-byte sequence,
+    // while the first one is still incomplete.
+    auto const r2 = from_utf8(state, 0xE2); // Invalid refers to the first, abandoned sequence.
+    REQUIRE(holds_alternative<Invalid>(r2));
+    auto const r3 = from_utf8(state, 0x82);
+    REQUIRE(holds_alternative<Incomplete>(r3));
+    auto const r4 = from_utf8(state, 0xAC);
+    REQUIRE(holds_alternative<Success>(r4));
+    REQUIRE((unsigned) get<Success>(r4).value == 0x20AC); // 0xE2 0x82 0xAC decodes to U+20AC.
+}
+
+TEST_CASE("utf8.from_utf8.incomplete.4", "[utf8]")
+{
+    auto constexpr sequence = "\xF0\x9F\x8D\xA3"sv; // 4-byte UTF-8 encoding of U+1F363.
+    auto constexpr codepoint = 0x1F363;
+
+    auto state = utf8_decoder_state {};
+
+    // Feed all but the last byte, leaving an incomplete multi-byte sequence.
+    for (size_t i = 0; i < sequence.size() - 1; ++i)
+    {
+        CAPTURE(i, unsigned(sequence[i]));
+        auto const r = from_utf8(state, (uint8_t) sequence[i]);
+        REQUIRE(holds_alternative<Incomplete>(r));
+    }
+
+    // Now feed the same sequence again from its first byte, this time completely.
+    auto const r0 = from_utf8(state, (uint8_t) sequence[0]); // Invalid refers to the abandoned first attempt.
+    REQUIRE(holds_alternative<Invalid>(r0));
+
+    for (size_t i = 1; i < sequence.size() - 1; ++i) // Middle bytes of the second attempt.
+    {
+        CAPTURE(i, unsigned(sequence[i]));
+        auto const ri = from_utf8(state, (uint8_t) sequence[i]);
+        REQUIRE(holds_alternative<Incomplete>(ri));
+    }
+
+    auto const last = from_utf8(state, (uint8_t) sequence.back()); // Final byte completes the codepoint.
+    REQUIRE(holds_alternative<Success>(last));
+    REQUIRE(get<Success>(last).value == codepoint);
+}
+
 TEST_CASE("utf8.iter", "[utf8]")
 {
     auto constexpr values = string_view {