From b3c1f882f0287e049ac3a83de2decd8c9b959c85 Mon Sep 17 00:00:00 2001
From: Christian Parpart <christian@parpart.family>
Date: Fri, 14 Apr 2023 06:34:41 +0200
Subject: [PATCH 1/5] Move `from_utf8(state, byte)` into .cpp file

Signed-off-by: Christian Parpart <christian@parpart.family>
---
 src/libunicode/CMakeLists.txt |  1 +
 src/libunicode/utf8.cpp       | 66 +++++++++++++++++++++++++++++++++++
 src/libunicode/utf8.h         | 47 +------------------------
 3 files changed, 68 insertions(+), 46 deletions(-)
 create mode 100644 src/libunicode/utf8.cpp

diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt
index da490c7..2b9ed3f 100644
--- a/src/libunicode/CMakeLists.txt
+++ b/src/libunicode/CMakeLists.txt
@@ -89,6 +89,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
     grapheme_segmenter.cpp
     scan.cpp
     script_segmenter.cpp
+    utf8.cpp
     width.cpp
 
     # auto-generated by unicode_tablgen
diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp
new file mode 100644
index 0000000..5cadd96
--- /dev/null
+++ b/src/libunicode/utf8.cpp
@@ -0,0 +1,66 @@
+/**
+ * This file is part of the "libunicode" project
+ *   Copyright (c) 2020 Christian Parpart <christian@parpart.family>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <libunicode/utf8.h>
+
+namespace unicode
+{
+
+ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte)
+{
+    if (!_state.expectedLength)
+    {
+        if ((_byte & 0b1000'0000) == 0)
+        {
+            _state.currentLength = 1;
+            return Success { _byte };
+        }
+        else if ((_byte & 0b1110'0000) == 0b1100'0000)
+        {
+            _state.currentLength = 1;
+            _state.expectedLength = 2;
+            _state.character = _byte & 0b0001'1111;
+        }
+        else if ((_byte & 0b1111'0000) == 0b1110'0000)
+        {
+            _state.currentLength = 1;
+            _state.expectedLength = 3;
+            _state.character = _byte & 0b0000'1111;
+        }
+        else if ((_byte & 0b1111'1000) == 0b1111'0000)
+        {
+            _state.currentLength = 1;
+            _state.expectedLength = 4;
+            _state.character = _byte & 0b0000'0111;
+        }
+        else
+        {
+            _state.currentLength = 1;
+            return Invalid {};
+        }
+    }
+    else
+    {
+        _state.character <<= 6;
+        _state.character |= _byte & 0b0011'1111;
+        _state.currentLength++;
+    }
+
+    if (_state.currentLength < _state.expectedLength)
+        return { Incomplete {} };
+
+    _state.expectedLength = 0; // reset state
+    return { Success { _state.character } };
+}
+
+} // namespace unicode
diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h
index c65743d..16898e7 100644
--- a/src/libunicode/utf8.h
+++ b/src/libunicode/utf8.h
@@ -109,52 +109,7 @@ struct Success
 using ConvertResult = std::variant<Invalid, Incomplete, Success>;
 
 /// Progressively decodes a UTF-8 codepoint.
-inline ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte)
-{
-    if (!_state.expectedLength)
-    {
-        if ((_byte & 0b1000'0000) == 0)
-        {
-            _state.currentLength = 1;
-            return Success { _byte };
-        }
-        else if ((_byte & 0b1110'0000) == 0b1100'0000)
-        {
-            _state.currentLength = 1;
-            _state.expectedLength = 2;
-            _state.character = _byte & 0b0001'1111;
-        }
-        else if ((_byte & 0b1111'0000) == 0b1110'0000)
-        {
-            _state.currentLength = 1;
-            _state.expectedLength = 3;
-            _state.character = _byte & 0b0000'1111;
-        }
-        else if ((_byte & 0b1111'1000) == 0b1111'0000)
-        {
-            _state.currentLength = 1;
-            _state.expectedLength = 4;
-            _state.character = _byte & 0b0000'0111;
-        }
-        else
-        {
-            _state.currentLength = 1;
-            return Invalid {};
-        }
-    }
-    else
-    {
-        _state.character <<= 6;
-        _state.character |= _byte & 0b0011'1111;
-        _state.currentLength++;
-    }
-
-    if (_state.currentLength < _state.expectedLength)
-        return { Incomplete {} };
-
-    _state.expectedLength = 0; // reset state
-    return { Success { _state.character } };
-}
+ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte);
 
 inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte)
 {

From bae06ae4efbe38e7d03ffa9b8202e656d63e7f66 Mon Sep 17 00:00:00 2001
From: Christian Parpart <christian@parpart.family>
Date: Fri, 14 Apr 2023 06:40:17 +0200
Subject: [PATCH 2/5] Apply some internal variable renames in `from_utf8(state,
 byte)`

Signed-off-by: Christian Parpart <christian@parpart.family>
---
 src/libunicode/utf8.cpp | 48 ++++++++++++++++++++---------------------
 src/libunicode/utf8.h   |  2 +-
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp
index 5cadd96..fe37740 100644
--- a/src/libunicode/utf8.cpp
+++ b/src/libunicode/utf8.cpp
@@ -16,51 +16,51 @@
 namespace unicode
 {
 
-ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte)
+ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value)
 {
-    if (!_state.expectedLength)
+    if (!state.expectedLength)
     {
-        if ((_byte & 0b1000'0000) == 0)
+        if ((value & 0b1000'0000) == 0)
         {
-            _state.currentLength = 1;
-            return Success { _byte };
+            state.currentLength = 1;
+            return Success { value };
         }
-        else if ((_byte & 0b1110'0000) == 0b1100'0000)
+        else if ((value & 0b1110'0000) == 0b1100'0000)
         {
-            _state.currentLength = 1;
-            _state.expectedLength = 2;
-            _state.character = _byte & 0b0001'1111;
+            state.currentLength = 1;
+            state.expectedLength = 2;
+            state.character = value & 0b0001'1111;
         }
-        else if ((_byte & 0b1111'0000) == 0b1110'0000)
+        else if ((value & 0b1111'0000) == 0b1110'0000)
         {
-            _state.currentLength = 1;
-            _state.expectedLength = 3;
-            _state.character = _byte & 0b0000'1111;
+            state.currentLength = 1;
+            state.expectedLength = 3;
+            state.character = value & 0b0000'1111;
         }
-        else if ((_byte & 0b1111'1000) == 0b1111'0000)
+        else if ((value & 0b1111'1000) == 0b1111'0000)
         {
-            _state.currentLength = 1;
-            _state.expectedLength = 4;
-            _state.character = _byte & 0b0000'0111;
+            state.currentLength = 1;
+            state.expectedLength = 4;
+            state.character = value & 0b0000'0111;
         }
         else
         {
-            _state.currentLength = 1;
+            state.currentLength = 1;
             return Invalid {};
         }
     }
     else
     {
-        _state.character <<= 6;
-        _state.character |= _byte & 0b0011'1111;
-        _state.currentLength++;
+        state.character <<= 6;
+        state.character |= value & 0b0011'1111;
+        state.currentLength++;
     }
 
-    if (_state.currentLength < _state.expectedLength)
+    if (state.currentLength < state.expectedLength)
         return { Incomplete {} };
 
-    _state.expectedLength = 0; // reset state
-    return { Success { _state.character } };
+    state.expectedLength = 0; // reset state
+    return { Success { state.character } };
 }
 
 } // namespace unicode
diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h
index 16898e7..d09ac3c 100644
--- a/src/libunicode/utf8.h
+++ b/src/libunicode/utf8.h
@@ -109,7 +109,7 @@ struct Success
 using ConvertResult = std::variant<Invalid, Incomplete, Success>;
 
 /// Progressively decodes a UTF-8 codepoint.
-ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte);
+ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value);
 
 inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte)
 {

From 644028c1e19f5df66215fba8a56555769b95ffc5 Mon Sep 17 00:00:00 2001
From: Christian Parpart <christian@parpart.family>
Date: Fri, 14 Apr 2023 06:41:12 +0200
Subject: [PATCH 3/5] Mark `from_utf8(state, byte)` as `noexcept`.

Signed-off-by: Christian Parpart <christian@parpart.family>
---
 src/libunicode/utf8.cpp | 2 +-
 src/libunicode/utf8.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp
index fe37740..40f81ef 100644
--- a/src/libunicode/utf8.cpp
+++ b/src/libunicode/utf8.cpp
@@ -16,7 +16,7 @@
 namespace unicode
 {
 
-ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value)
+ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept
 {
     if (!state.expectedLength)
     {
diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h
index d09ac3c..8a1662f 100644
--- a/src/libunicode/utf8.h
+++ b/src/libunicode/utf8.h
@@ -109,7 +109,7 @@ struct Success
 using ConvertResult = std::variant<Invalid, Incomplete, Success>;
 
 /// Progressively decodes a UTF-8 codepoint.
-ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value);
+ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept;
 
 inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte)
 {

From c0433805c3f6438383200cf31b87c9c041437e3b Mon Sep 17 00:00:00 2001
From: Christian Parpart <christian@parpart.family>
Date: Fri, 14 Apr 2023 06:44:00 +0200
Subject: [PATCH 4/5] Fix UTF-8 decoding of incomplete UTF-8 multibyte
 sequences to properly report `Invalid`

Signed-off-by: Christian Parpart <christian@parpart.family>
---
 Changelog.md                 |  2 +-
 src/libunicode/utf8.cpp      | 14 +++++++
 src/libunicode/utf8_test.cpp | 71 ++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/Changelog.md b/Changelog.md
index 60cda3a..be62fb8 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,6 +1,6 @@
 ## 0.3.1 (unreleased)
 
-- ...
+- Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`.
 
 ## 0.3.0 (2023-03-01)
 
diff --git a/src/libunicode/utf8.cpp b/src/libunicode/utf8.cpp
index 40f81ef..082e607 100644
--- a/src/libunicode/utf8.cpp
+++ b/src/libunicode/utf8.cpp
@@ -46,9 +46,23 @@ ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept
         else
         {
             state.currentLength = 1;
+            state.expectedLength = 0;
             return Invalid {};
         }
     }
+    // clang-format off
+    else if ((value & 0b1110'0000) == 0b1100'0000
+          || (value & 0b1111'0000) == 0b1110'0000
+          || (value & 0b1111'1000) == 0b1111'0000)
+    // clang-format on
+    {
+        // We have a new codepoint, but the previous one was incomplete.
+        state.expectedLength = 0;
+        // Report Invalid for the interrupted codepoint; the recursive call
+        // below has already started decoding the next one from this byte.
+        from_utf8(state, value);
+        return { Invalid {} };
+    }
     else
     {
         state.character <<= 6;
diff --git a/src/libunicode/utf8_test.cpp b/src/libunicode/utf8_test.cpp
index c4a3715..ec3ff0b 100644
--- a/src/libunicode/utf8_test.cpp
+++ b/src/libunicode/utf8_test.cpp
@@ -146,6 +146,77 @@ TEST_CASE("utf8.from_utf8.invalid", "[utf8]")
     CHECK(a32 == U"HiHo");
 }
 
+TEST_CASE("utf8.from_utf8.incomplete.2", "[utf8]")
+{
+    // Ensure incomplete bytes are consumed and reported as Invalid accordingly.
+    auto state = utf8_decoder_state {};
+
+    // We start with an incomplete 2-byte sequence.
+    auto const r0 = from_utf8(state, 0xC7);
+    REQUIRE(holds_alternative<Incomplete>(r0));
+
+    // Continue with another 2-byte sequence,
+    // while the first one is still incomplete.
+    auto const r1 = from_utf8(state, 0xC7);
+    REQUIRE(holds_alternative<Invalid>(r1));
+    auto const r2 = from_utf8(state, 0x8E);
+    REQUIRE(holds_alternative<Success>(r2));
+    REQUIRE((unsigned) get<Success>(r2).value == 0x01CE);
+}
+
+TEST_CASE("utf8.from_utf8.incomplete.3", "[utf8]")
+{
+    // Ensure incomplete bytes are consumed and reported as Invalid accordingly.
+    auto state = utf8_decoder_state {};
+
+    // We start with an incomplete 3-byte sequence.
+    auto const r0 = from_utf8(state, 0xE2);
+    REQUIRE(holds_alternative<Incomplete>(r0));
+    auto const r1 = from_utf8(state, 0x82);
+    REQUIRE(holds_alternative<Incomplete>(r1));
+
+    // Continue with another 3-byte sequence,
+    // while the first one is still incomplete.
+    auto const r2 = from_utf8(state, 0xE2);
+    REQUIRE(holds_alternative<Invalid>(r2));
+    auto const r3 = from_utf8(state, 0x82);
+    REQUIRE(holds_alternative<Incomplete>(r3));
+    auto const r4 = from_utf8(state, 0xAC);
+    REQUIRE(holds_alternative<Success>(r4));
+    REQUIRE((unsigned) get<Success>(r4).value == 0x20AC);
+}
+
+TEST_CASE("utf8.from_utf8.incomplete.4", "[utf8]")
+{
+    auto constexpr sequence = "\xF0\x9F\x8D\xA3"sv;
+    auto constexpr codepoint = 0x1F363;
+
+    auto state = utf8_decoder_state {};
+
+    // Generate an incomplete multi-byte sequence.
+    for (size_t i = 0; i < sequence.size() - 1; ++i)
+    {
+        CAPTURE(i, unsigned(sequence[i]));
+        auto const r = from_utf8(state, (uint8_t) sequence[i]);
+        REQUIRE(holds_alternative<Incomplete>(r));
+    }
+
+    // Now restart the multi-byte sequence and feed it completely this time.
+    auto const r0 = from_utf8(state, (uint8_t) sequence[0]);
+    REQUIRE(holds_alternative<Invalid>(r0));
+
+    for (size_t i = 1; i < sequence.size() - 1; ++i)
+    {
+        CAPTURE(i, unsigned(sequence[i]));
+        auto const ri = from_utf8(state, (uint8_t) sequence[i]);
+        REQUIRE(holds_alternative<Incomplete>(ri));
+    }
+
+    auto const last = from_utf8(state, (uint8_t) sequence.back());
+    REQUIRE(holds_alternative<Success>(last));
+    REQUIRE(get<Success>(last).value == codepoint);
+}
+
 TEST_CASE("utf8.iter", "[utf8]")
 {
     auto constexpr values = string_view {

From a0c98926f58473b55746f0814ce82350283c4b74 Mon Sep 17 00:00:00 2001
From: Christian Parpart <christian@parpart.family>
Date: Fri, 14 Apr 2023 06:44:43 +0200
Subject: [PATCH 5/5] Change signature of `inline from_utf8(string_view
 const&)` slightly by dropping its cref

Signed-off-by: Christian Parpart <christian@parpart.family>
---
 Changelog.md          | 1 +
 src/libunicode/utf8.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Changelog.md b/Changelog.md
index be62fb8..68ee64e 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,6 +1,7 @@
 ## 0.3.1 (unreleased)
 
 - Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`.
+- Change signature of `inline from_utf8(string_view const&)` slightly by dropping its cref.
 
 ## 0.3.0 (2023-03-01)
 
diff --git a/src/libunicode/utf8.h b/src/libunicode/utf8.h
index 8a1662f..a49aa5b 100644
--- a/src/libunicode/utf8.h
+++ b/src/libunicode/utf8.h
@@ -152,7 +152,7 @@ inline ConvertResult from_utf8(char const* _bytes, size_t* _size)
 }
 
 template <typename T = char32_t>
-inline std::basic_string<T> from_utf8(std::string_view const& _bytes)
+inline std::basic_string<T> from_utf8(std::string_view _bytes)
 {
     static_assert(sizeof(T) == 4);
     std::basic_string<T> s;