Skip to content

Commit

Permalink
Merge pull request #71 from contour-terminal/fix/utf8-decode
Browse files Browse the repository at this point in the history
Fix error handling of UTF-8 decoding when incomplete UTF-8 sequences are processed
  • Loading branch information
christianparpart authored Apr 14, 2023
2 parents 65e0c6d + a0c9892 commit ad9398a
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 48 deletions.
3 changes: 2 additions & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## 0.3.1 (unreleased)

- ...
- Fix UTF-8 decoding of incomplete UTF-8 multibyte sequences to properly report `Invalid`.
- Change the signature of `inline from_utf8(string_view const&)` slightly: the argument is now taken by value (`std::string_view`) instead of by const reference.

## 0.3.0 (2023-03-01)

Expand Down
1 change: 1 addition & 0 deletions src/libunicode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
grapheme_segmenter.cpp
scan.cpp
script_segmenter.cpp
utf8.cpp
width.cpp

# auto-generated by unicode_tablegen
Expand Down
80 changes: 80 additions & 0 deletions src/libunicode/utf8.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/**
* This file is part of the "libunicode" project
* Copyright (c) 2020 Christian Parpart <[email protected]>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <libunicode/utf8.h>

namespace unicode
{

/// Progressively decodes a UTF-8 codepoint, one input byte per call.
///
/// @param state  Decoder state carried across calls. A value-initialized state
///               (expectedLength == 0) means "no sequence pending".
/// @param value  The next input byte.
///
/// @return
///   - Success    if @p value is an ASCII byte, or the final continuation byte
///                of a pending multi-byte sequence; carries the decoded codepoint.
///   - Incomplete if more bytes are required to finish the pending sequence.
///   - Invalid    if @p value cannot start a sequence (stray continuation byte
///                or 0xF8..0xFF), or if it interrupts a pending multi-byte
///                sequence before that sequence was complete.
///
/// When a pending sequence is interrupted by a new multi-byte lead byte, Invalid
/// is returned for the truncated sequence and the lead byte is retained as the
/// start of the next sequence. When it is interrupted by any other
/// non-continuation byte (ASCII or 0xF8..0xFF), Invalid is returned, the state
/// is reset, and that byte is consumed: only one result can be reported per call.
///
/// This function does not reject overlong encodings, surrogate codepoints, or
/// values above U+10FFFF.
ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept
{
    auto const isContinuationByte = (value & 0b1100'0000) == 0b1000'0000;

    if (state.expectedLength != 0 && isContinuationByte)
    {
        // Regular case inside a multi-byte sequence: append the 6 payload bits.
        state.character <<= 6;
        state.character |= value & 0b0011'1111;
        state.currentLength++;

        if (state.currentLength < state.expectedLength)
            return { Incomplete {} };

        state.expectedLength = 0; // reset state
        return { Success { state.character } };
    }

    // Either no sequence is pending, or the pending sequence got interrupted by a
    // byte that is not a continuation byte. In the latter case the pending sequence
    // is truncated and must be reported as Invalid instead of silently folding the
    // unrelated byte into its codepoint.
    auto const interrupted = state.expectedLength != 0;
    state.expectedLength = 0;
    state.currentLength = 1;

    if ((value & 0b1000'0000) == 0)
    {
        if (interrupted)
            return { Invalid {} };
        return Success { value };
    }

    if ((value & 0b1110'0000) == 0b1100'0000)
    {
        state.expectedLength = 2;
        state.character = value & 0b0001'1111;
    }
    else if ((value & 0b1111'0000) == 0b1110'0000)
    {
        state.expectedLength = 3;
        state.character = value & 0b0000'1111;
    }
    else if ((value & 0b1111'1000) == 0b1111'0000)
    {
        state.expectedLength = 4;
        state.character = value & 0b0000'0111;
    }
    else
    {
        // Stray continuation byte while idle, or a byte that never occurs in UTF-8.
        return Invalid {};
    }

    // A new multi-byte sequence has been started. If it cut off a previous one,
    // report Invalid for that one; the new sequence is already recorded in state.
    if (interrupted)
        return { Invalid {} };

    return { Incomplete {} };
}

} // namespace unicode
49 changes: 2 additions & 47 deletions src/libunicode/utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,52 +109,7 @@ struct Success
using ConvertResult = std::variant<Invalid, Incomplete, Success>;

/// Progressively decodes a UTF-8 codepoint, one input byte per call.
///
/// @param _state  Decoder state carried across calls. A value-initialized state
///                (expectedLength == 0) means "no sequence pending".
/// @param _byte   The next input byte.
///
/// @return Success with the decoded codepoint once a sequence is complete,
///         Incomplete while more continuation bytes are required, and Invalid
///         if @p _byte cannot start a sequence or interrupts a pending
///         multi-byte sequence before it was complete.
///
/// If the interrupting byte is itself a multi-byte lead byte, it is retained as
/// the start of the next sequence. Any other interrupting byte (ASCII or
/// 0xF8..0xFF) is consumed together with the Invalid result, because only one
/// result can be reported per call.
inline ConvertResult from_utf8(utf8_decoder_state& _state, uint8_t _byte)
{
    auto const isContinuationByte = (_byte & 0b1100'0000) == 0b1000'0000;

    if (_state.expectedLength != 0 && isContinuationByte)
    {
        // Regular case inside a multi-byte sequence: append the 6 payload bits.
        _state.character <<= 6;
        _state.character |= _byte & 0b0011'1111;
        _state.currentLength++;

        if (_state.currentLength < _state.expectedLength)
            return { Incomplete {} };

        _state.expectedLength = 0; // reset state
        return { Success { _state.character } };
    }

    // Either idle, or the pending sequence was cut off by a non-continuation byte.
    // A truncated sequence must yield Invalid rather than absorbing the new byte.
    auto const interrupted = _state.expectedLength != 0;
    _state.expectedLength = 0;
    _state.currentLength = 1;

    if ((_byte & 0b1000'0000) == 0)
    {
        if (interrupted)
            return { Invalid {} };
        return Success { _byte };
    }

    if ((_byte & 0b1110'0000) == 0b1100'0000)
    {
        _state.expectedLength = 2;
        _state.character = _byte & 0b0001'1111;
    }
    else if ((_byte & 0b1111'0000) == 0b1110'0000)
    {
        _state.expectedLength = 3;
        _state.character = _byte & 0b0000'1111;
    }
    else if ((_byte & 0b1111'1000) == 0b1111'0000)
    {
        _state.expectedLength = 4;
        _state.character = _byte & 0b0000'0111;
    }
    else
    {
        // Stray continuation byte while idle, or a byte that never occurs in UTF-8.
        return Invalid {};
    }

    // The new lead byte is already recorded in _state; if it interrupted an
    // earlier sequence, this call reports that earlier sequence as Invalid.
    if (interrupted)
        return { Invalid {} };

    return { Incomplete {} };
}
ConvertResult from_utf8(utf8_decoder_state& state, uint8_t value) noexcept;

inline unsigned from_utf8i(utf8_decoder_state& _state, uint8_t _byte)
{
Expand Down Expand Up @@ -197,7 +152,7 @@ inline ConvertResult from_utf8(char const* _bytes, size_t* _size)
}

template <typename T = char32_t>
inline std::basic_string<T> from_utf8(std::string_view const& _bytes)
inline std::basic_string<T> from_utf8(std::string_view _bytes)
{
static_assert(sizeof(T) == 4);
std::basic_string<T> s;
Expand Down
71 changes: 71 additions & 0 deletions src/libunicode/utf8_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,77 @@ TEST_CASE("utf8.from_utf8.invalid", "[utf8]")
CHECK(a32 == U"HiHo");
}

TEST_CASE("utf8.from_utf8.incomplete.2", "[utf8]")
{
// Ensure a truncated 2-byte sequence is reported as Invalid once the next
// lead byte arrives, and that this lead byte already starts the next sequence.
auto state = utf8_decoder_state {};

// Start a 2-byte sequence (lead byte 0xC7) without supplying its continuation byte.
auto const r0 = from_utf8(state, 0xC7);
REQUIRE(holds_alternative<Incomplete>(r0));

// Feed another 2-byte lead byte while the first sequence is still incomplete:
// the pending sequence is reported as Invalid.
auto const r1 = from_utf8(state, 0xC7);
REQUIRE(holds_alternative<Invalid>(r1));
// The second lead byte must have been retained, so a single continuation
// byte now completes the codepoint U+01CE.
auto const r2 = from_utf8(state, 0x8E);
REQUIRE(holds_alternative<Success>(r2));
REQUIRE((unsigned) get<Success>(r2).value == 0x01CE);
}

TEST_CASE("utf8.from_utf8.incomplete.3", "[utf8]")
{
// Ensure a truncated 3-byte sequence is reported as Invalid once the next
// lead byte arrives, and that this lead byte already starts the next sequence.
auto state = utf8_decoder_state {};

// Start a 3-byte sequence (lead byte 0xE2) and supply only the first of its
// two continuation bytes.
auto const r0 = from_utf8(state, 0xE2);
REQUIRE(holds_alternative<Incomplete>(r0));
auto const r1 = from_utf8(state, 0x82);
REQUIRE(holds_alternative<Incomplete>(r1));

// Feed another 3-byte lead byte while the first sequence is still incomplete:
// the pending sequence is reported as Invalid, and the new sequence then
// decodes normally to U+20AC.
auto const r2 = from_utf8(state, 0xE2);
REQUIRE(holds_alternative<Invalid>(r2));
auto const r3 = from_utf8(state, 0x82);
REQUIRE(holds_alternative<Incomplete>(r3));
auto const r4 = from_utf8(state, 0xAC);
REQUIRE(holds_alternative<Success>(r4));
REQUIRE((unsigned) get<Success>(r4).value == 0x20AC);
}

TEST_CASE("utf8.from_utf8.incomplete.4", "[utf8]")
{
// Ensure a truncated 4-byte sequence is reported as Invalid once the next
// lead byte arrives, and that this lead byte already starts the next sequence.
auto constexpr sequence = "\xF0\x9F\x8D\xA3"sv;
auto constexpr codepoint = 0x1F363;

auto state = utf8_decoder_state {};

// Generate an incomplete multi-byte sequence: feed every byte except the last one.
for (size_t i = 0; i < sequence.size() - 1; ++i)
{
CAPTURE(i, unsigned(sequence[i]));
auto const r = from_utf8(state, (uint8_t) sequence[i]);
REQUIRE(holds_alternative<Incomplete>(r));
}

// Now feed the same sequence again from its lead byte, this time completely.
// The lead byte interrupts the pending sequence, which is reported as Invalid.
auto const r0 = from_utf8(state, (uint8_t) sequence[0]);
REQUIRE(holds_alternative<Invalid>(r0));

// The middle bytes of the restarted sequence are still Incomplete.
for (size_t i = 1; i < sequence.size() - 1; ++i)
{
CAPTURE(i, unsigned(sequence[i]));
auto const ri = from_utf8(state, (uint8_t) sequence[i]);
REQUIRE(holds_alternative<Incomplete>(ri));
}

// The final byte completes the restarted sequence and yields the expected codepoint.
auto const last = from_utf8(state, (uint8_t) sequence.back());
REQUIRE(holds_alternative<Success>(last));
REQUIRE(get<Success>(last).value == codepoint);
}

TEST_CASE("utf8.iter", "[utf8]")
{
auto constexpr values = string_view {
Expand Down

0 comments on commit ad9398a

Please sign in to comment.