Skip to content

Commit

Permalink
Use end of chunk in complex unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
Yaraslaut committed Jul 25, 2024
1 parent fbb4a9d commit 3b705a5
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ src/libunicode/ucd.h
src/libunicode/ucd_enums.h
src/libunicode/ucd_fmt.h
src/libunicode/ucd_ostream.h
/.cache/
65 changes: 57 additions & 8 deletions src/libunicode/grapheme_line_segmenter.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#include <libunicode/grapheme_segmenter.h>
#include <libunicode/support.h>

#if defined(LIBUNICODE_TRACE)
#include <fmt/core.h>

#if 0 || defined(LIBUNICODE_TRACE)
#include <format>
#include <iostream>

Expand Down Expand Up @@ -457,9 +459,11 @@ namespace detail
//
// @returns a sequence of grapheme clusters up to maxWidth width.
template <OptionalGraphemeSegmentationListenerConcept EventHandlerT>
LIBUNICODE_INLINE auto process_only_complex_unicode(
EventHandlerT& eventHandler, unicode_process_state& state, char const* start, char const* end, unsigned maxWidth) noexcept
-> detail::unicode_process_result
LIBUNICODE_INLINE auto process_only_complex_unicode(EventHandlerT& eventHandler,
unicode_process_state& state,
char const* start,
char const* end,
unsigned maxWidth) noexcept -> detail::unicode_process_result
{
if (!state.utf8DecodeNext)
{
Expand Down Expand Up @@ -538,6 +542,10 @@ namespace detail
return make_scan_result(consumedWidths, state.currentClusterStart, StopCondition::EndOfWidth);
}
}
else
{
// Boundary of a grapheme not found yet.
}
}
else if (std::holds_alternative<unicode::Invalid>(result))
{
Expand Down Expand Up @@ -660,15 +668,22 @@ class grapheme_line_segmenter<OptionalEventListener>
return { .text = {}, .width = 0, .stop_condition = StopCondition::EndOfInput };

// Points to the beginning of a grapheme cluster.
char const* const resultStart = _complexUnicodeState.currentClusterStart;
char const* const resultStart = _complexUnicodeState.currentCodepointStart;
char const* const endAtMaxWidth = std::min(end(), next() + maxWidth);

LIBUNICODE_TRACE_SEGMENTER("resultStart: {}\n", (void*) resultStart);
// Total number of widths used in the current line.
unsigned processedTotalWidth = 0;

while (true)
{
switch (detail::make_state(next(), end(), processedTotalWidth, maxWidth))
auto const state = detail::make_state(next(), end(), processedTotalWidth, maxWidth);
LIBUNICODE_TRACE_SEGMENTER("currentClusterStart: {}, end: {} , processedTotalWidth: {}, state: {} \n",
(void*) _complexUnicodeState.currentClusterStart,
(void*) end(),
processedTotalWidth,
state);
switch (state)
{
case State::EndOfInput:
return { .text = { resultStart, _complexUnicodeState.currentClusterStart },
Expand Down Expand Up @@ -725,14 +740,24 @@ class grapheme_line_segmenter<OptionalEventListener>
std::string_view(start, chunk.end),
(long) std::distance(start, chunk.end),
chunk.totalWidth,
(int) chunk.stop_condition);
[](auto stop) {
switch (stop)
{
case StopCondition::UnexpectedInput: return "UnexpectedInput";
case StopCondition::EndOfInput: return "EndOfInput";
case StopCondition::EndOfWidth: return "EndOfWidth";
}
return "INVALID";
}(chunk.stop_condition));
processedTotalWidth += chunk.totalWidth;
assert(processedTotalWidth <= maxWidth);
if (chunk.stop_condition != StopCondition::UnexpectedInput)
{
// The most recent grapheme cluster does not fit into the current line or the input is exhausted.
return { .text = std::string_view { resultStart, _complexUnicodeState.currentClusterStart },
return { .text = std::string_view { resultStart, chunk.end },
.width = processedTotalWidth,
.stop_condition = chunk.stop_condition };
}
break;
}
}
Expand Down Expand Up @@ -828,4 +853,28 @@ inline std::ostream& operator<<(std::ostream& os, unicode::grapheme_segmentation
<< ", stop: " << value.stop_condition << "}";
}
} // namespace std

namespace fmt
{

// {fmt} formatter for unicode::detail::State.
//
// Prints the enumerator's name ("EndOfInput", "EndOfWidth", "C0", "ASCII",
// "ComplexUnicode"). A value that matches none of the enumerators is printed
// as an empty string. Only format() is defined here; everything else comes
// from the std::string_view formatter this specialization derives from.
template <>
struct formatter<unicode::detail::State>: formatter<std::string_view>
{
    // Writes the name of @p value to the output of @p ctx.
    //
    // The member is const-qualified: {fmt} invokes format() on a const
    // formatter object, and the other formatters of this project are
    // declared the same way.
    //
    // @returns the output iterator past the written text.
    template <typename FormatContext>
    auto format(unicode::detail::State const& value, FormatContext& ctx) const
    {
        std::string_view name;
        switch (value)
        {
            case unicode::detail::State::EndOfInput: name = "EndOfInput"; break;
            case unicode::detail::State::EndOfWidth: name = "EndOfWidth"; break;
            case unicode::detail::State::C0: name = "C0"; break;
            case unicode::detail::State::ASCII: name = "ASCII"; break;
            case unicode::detail::State::ComplexUnicode: name = "ComplexUnicode"; break;
        }
        return formatter<std::string_view>::format(name, ctx);
    }
};

} // namespace fmt

// }}}
148 changes: 146 additions & 2 deletions src/libunicode/grapheme_line_segmenter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,19 @@
#include <catch2/catch_message.hpp>
#include <catch2/catch_test_macros.hpp>

#include <iostream>
#include <string_view>
#include <variant>

#if 0 || defined(LIBUNICODE_TRACE)
#include <format>
#include <iostream>

#define TRACE(...) std::cout << std::format(__VA_ARGS__)
#else
#define TRACE(...) ((void) 0)
#endif

using namespace std::string_view_literals;
using namespace std::string_literals;
using std::pair;
Expand Down Expand Up @@ -55,6 +65,31 @@ std::ostream& operator<<(std::ostream& os, expectation const& e)
} // namespace std
// }}}

namespace fmt
{

// Prints an expectation as "{ offset: O, size: S, width: W }".
// Derives from the std::string_view formatter and supplies only format().
template <>
struct formatter<expectation>: formatter<std::string_view>
{
    template <typename FormatContext>
    auto format(expectation const& expected, FormatContext& ctx) const
    {
        return fmt::format_to(ctx.out(),
                              "{{ offset: {}, size: {}, width: {} }}",
                              expected.offset,
                              expected.size,
                              expected.width);
    }
};

// Prints a (stop condition, unsigned) pair as "{first, second}".
// Derives from the std::string_view formatter and supplies only format().
template <>
struct formatter<std::pair<unicode::StopCondition, unsigned>>: formatter<std::string_view>
{
    template <typename FormatContext>
    auto format(std::pair<unicode::StopCondition, unsigned> const& entry, FormatContext& ctx) const
    {
        auto const& [stop, count] = entry;
        return fmt::format_to(ctx.out(), "{{{}, {}}}", stop, count);
    }
};

} // namespace fmt

// {{{ helpers
namespace
{
Expand Down Expand Up @@ -120,12 +155,34 @@ struct complex_unicode_sequence
return os << "{ value: \"" << e(seq.value) << "\", width: " << seq.width << " }";
}

using Record = std::variant<invalid_sequence, ascii_sequence, complex_unicode_sequence>;

} // namespace

namespace
namespace fmt
{
// Prints a Record by dispatching on the alternative currently held:
// invalid_sequence and ascii_sequence print their value, while
// complex_unicode_sequence prints its value and its width.
template <>
struct formatter<Record>: formatter<std::string_view>
{
    template <typename FormatContext>
    auto format(Record const& r, FormatContext& ctx) const
    {
        if (auto const* invalid = std::get_if<invalid_sequence>(&r))
            return fmt::format_to(ctx.out(), "invalid_sequence {{ value: \"{}\" }}", invalid->value);

        if (auto const* ascii = std::get_if<ascii_sequence>(&r))
            return fmt::format_to(ctx.out(), "ascii_sequence {{ value: \"{}\" }}", ascii->value);

        // Anything else is treated as a complex unicode sequence.
        auto const& complex = std::get<complex_unicode_sequence>(r);
        return fmt::format_to(ctx.out(),
                              "complex_unicode_sequence {{ value: \"{}\", width: {} }}",
                              complex.value,
                              complex.width);
    }
};

using Record = std::variant<invalid_sequence, ascii_sequence, complex_unicode_sequence>;
} // namespace fmt

namespace
{

auto constexpr FamilyEmoji = U"\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"sv;
auto constexpr SmileyEmoji = U"\U0001F600"sv;
Expand Down Expand Up @@ -649,3 +706,90 @@ TEST_CASE("grapheme_line_segmenter.complex.sliced_calls")
CHECK(result2.stop_condition == StopCondition::UnexpectedInput); // control character \033
REQUIRE(e(result2.text) == e(u8(SmileyEmoji)));
}

// Feeds a single two-byte UTF-8 sequence ('ö') to the segmenter in one call
// and checks that the returned text spans the whole input.
TEST_CASE("grapheme_utf8.0")
{
    auto constexpr text = "\xC3\xB6"sv; // 'ö'

    char const* const first = text.data();
    char const* const last = first + text.size();

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n",
          static_cast<void const*>(first),
          static_cast<void const*>(last),
          std::distance(first, last));

    auto const view = std::string_view(first, last);
    segmenter.reset(view);

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n",
          result.text,
          result.width,
          [](unicode::StopCondition stop) -> char const* {
              switch (stop)
              {
                  case unicode::StopCondition::UnexpectedInput: return "UnexpectedInput";
                  case unicode::StopCondition::EndOfWidth: return "EndOfWidth";
                  case unicode::StopCondition::EndOfInput: return "EndOfInput";
              }
              return "Unknown";
          }(result.stop_condition));

    CHECK(result.text == text);
    // NOTE(review): a width of 0 is expected here although the whole text is
    // returned - confirm this is the intended end-of-input behavior.
    CHECK(result.width == 0);
}

// Feeds 'ö' followed by a space to the segmenter in one call and checks that
// the whole input is returned with a total width of 2.
TEST_CASE("grapheme_utf8.1")
{
    auto constexpr text = "\xC3\xB6 "sv; // 'ö '

    char const* const first = text.data();
    char const* const last = first + text.size();

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n",
          static_cast<void const*>(first),
          static_cast<void const*>(last),
          std::distance(first, last));

    auto const view = std::string_view(first, last);
    segmenter.reset(view);

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n",
          result.text,
          result.width,
          [](unicode::StopCondition stop) -> char const* {
              switch (stop)
              {
                  case unicode::StopCondition::UnexpectedInput: return "UnexpectedInput";
                  case unicode::StopCondition::EndOfWidth: return "EndOfWidth";
                  case unicode::StopCondition::EndOfInput: return "EndOfInput";
              }
              return "Unknown";
          }(result.stop_condition));

    CHECK(result.text == text);
    CHECK(result.width == 2);
}

// Feeds 'ö' surrounded by ASCII letters to the segmenter in one call and
// checks that the whole input is returned with a total width of 3.
TEST_CASE("grapheme_utf8.2")
{
    auto constexpr text = "a\xC3\xB6a"sv; // 'aöa'

    char const* const first = text.data();
    char const* const last = first + text.size();

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n",
          static_cast<void const*>(first),
          static_cast<void const*>(last),
          std::distance(first, last));

    auto const view = std::string_view(first, last);
    segmenter.reset(view);

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n",
          result.text,
          result.width,
          [](unicode::StopCondition stop) -> char const* {
              switch (stop)
              {
                  case unicode::StopCondition::UnexpectedInput: return "UnexpectedInput";
                  case unicode::StopCondition::EndOfWidth: return "EndOfWidth";
                  case unicode::StopCondition::EndOfInput: return "EndOfInput";
              }
              return "Unknown";
          }(result.stop_condition));

    CHECK(result.text == text);
    CHECK(result.width == 3);
}

0 comments on commit 3b705a5

Please sign in to comment.