Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update supported Unicode to version 16.0.0 #107

Merged
merged 11 commits into from
Sep 30, 2024
Merged
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ src/libunicode/ucd.h
src/libunicode/ucd_enums.h
src/libunicode/ucd_fmt.h
src/libunicode/ucd_ostream.h
/src/libunicode/codepoint_properties_data.cpp
/src/libunicode/codepoint_properties_data.h
/src/libunicode/codepoint_properties_names.cpp
src/libunicode/codepoint_properties_data.cpp
src/libunicode/codepoint_properties_data.h
src/libunicode/codepoint_properties_names.cpp
24 changes: 24 additions & 0 deletions .vimspector.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"$schema": "https://puremourning.github.io/vimspector/schema/vimspector.schema.json#",
"configurations": {
"ModelTest": {
"adapter": "vscode-cpptools",
"configuration": {
"request": "launch",
"program": "${workspaceRoot}/build/linux-clang-debug/src/libunicode/unicode_test",
"args": [
],
"cwd": "${workspaceRoot}",
"externalConsole": true,
"stopAtEntry": false,
"MIMode": "gdb"
},
"breakpoints": {
"exception": {
"caught": "Y",
"uncaught": "Y"
}
}
}
}
}
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ if(LIBUNICODE_TESTING)
endif()

# ----------------------------------------------------------------------------
set(LIBUNICODE_UCD_VERSION "15.0.0" CACHE STRING "libunicode: Unicode version")
set(LIBUNICODE_UCD_VERSION "16.0.0" CACHE STRING "libunicode: Unicode version")
set(LIBUNICODE_UCD_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/_ucd" CACHE PATH "Path to directory for downloaded files & extracted directories.")

set(LIBUNICODE_UCD_ZIP_DOWNLOAD_URL "https://www.unicode.org/Public/${LIBUNICODE_UCD_VERSION}/ucd/UCD.zip")
set(LIBUNICODE_UCD_MD5 "8c66407dd8ce2d84278868a69ea83280")
set(LIBUNICODE_UCD_MD5 "bdd823cbd37c376633d6737a12281233")
set(LIBUNICODE_UCD_ZIP_FILE "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}.zip")
set(LIBUNICODE_UCD_DIR "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}" CACHE PATH "Path to UCD directory.")

Expand Down
1 change: 1 addition & 0 deletions src/libunicode/codepoint_properties_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ namespace
pair { "13.0"sv, Age::V13_0 },
pair { "14.0"sv, Age::V14_0 },
pair { "15.0"sv, Age::V15_0 },
pair { "16.0"sv, Age::V16_0 },
pair { "1.1"sv, Age::V1_1 },
pair { "2.0"sv, Age::V2_0 },
pair { "2.1"sv, Age::V2_1 },
Expand Down
6 changes: 2 additions & 4 deletions src/libunicode/run_segmenter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

#include <catch2/catch_test_macros.hpp>

#include <array>
#include <format>
#include <ostream>
#include <sstream>
#include <string>
Expand Down Expand Up @@ -144,9 +142,9 @@ TEST_CASE("run_segmenter.JapaneseHindiEmojiMix", "[run_segmenter]")
{ U"🌱🌲", Script::Han, PresentationStyle::Emoji } });
}

TEST_CASE("run_segmenter.CombiningCirlce", "[run_segmenter]")
TEST_CASE("run_segmenter.CombiningCircle", "[run_segmenter]")
{
test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Common, PresentationStyle::Text } });
test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Latin, PresentationStyle::Text } });
}

TEST_CASE("run_segmenter.Arab_Hangul", "[run_segmenter]")
Expand Down
4 changes: 2 additions & 2 deletions src/libunicode/script_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ optional<script_segmenter::result> script_segmenter::consume()
return res;
}

bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet)
bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept
{
if (nextSet.empty() || currentSet.empty())
return false;
Expand Down Expand Up @@ -122,7 +122,7 @@ bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet
return true;
}

script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint)
script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) noexcept
{
ScriptSet scriptSet;

Expand Down
5 changes: 2 additions & 3 deletions src/libunicode/script_segmenter.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

#include <optional>
#include <string_view>
#include <vector>

namespace unicode
{
Expand Down Expand Up @@ -81,13 +80,13 @@ class script_segmenter
}

/// Returns all scripts that @p codepoint is associated with.
ScriptSet getScriptsFor(char32_t codepoint);
ScriptSet getScriptsFor(char32_t codepoint) noexcept;

/// Intersects @p nextSet into @p currentSet.
///
/// @retval true Intersection succeeded, meaning that no boundary was found.
/// @retval false The resulting intersection is empty, meaning a script boundary was found.
bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet);
bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept;

/// Returns the resolved script.
///
Expand Down
27 changes: 20 additions & 7 deletions src/libunicode/script_segmenter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,9 @@

#include <catch2/catch_test_macros.hpp>

#include <string>
#include <string_view>

using namespace std::string_view_literals;
using namespace std::string_view_literals;
using std::optional;
using unicode::script_segmenter;

TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]")
{
Expand All @@ -35,15 +31,32 @@ TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]")
CHECK(res1.script == unicode::Script::Unknown);
}

TEST_CASE("script_segmenter.common_to_specific", "[script_segmenter]")
{
    // The digit '1' has the Script property Common and 'a' has the Script
    // property Latin, so the test expects a single run covering the whole
    // input, reported as Latin.
    auto constexpr text = U"1a"sv;
    auto segmenter = unicode::script_segmenter { text.data(), text.size() };

    // First (and only) run: spans every code point of the input.
    auto const firstRun = segmenter.consume();
    REQUIRE(firstRun.has_value());
    CHECK(firstRun->size == text.size());
    CHECK(firstRun->script == unicode::Script::Latin);

    // The input is exhausted, so no further run may be reported.
    REQUIRE_FALSE(segmenter.consume().has_value());
}

TEST_CASE("script_segmenter.greek_kanji_greek", "[script_segmenter]")
{
char32_t const* str = U"λ 合気道 λ;";
auto seg = script_segmenter { str };
auto seg = unicode::script_segmenter { str };

// greek text
optional<script_segmenter::result> const r1 = seg.consume();
std::optional<unicode::script_segmenter::result> const r1 = seg.consume();
REQUIRE(r1.has_value());
script_segmenter::result const res1 = r1.value();
unicode::script_segmenter::result const res1 = r1.value();
CHECK(res1.size == 2);
CHECK(res1.script == unicode::Script::Greek);

Expand Down
10 changes: 9 additions & 1 deletion src/libunicode/ucd_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,20 @@ constexpr std::optional<T> search(std::array<Prop<T>, N> const& ranges, char32_t

while (a < b)
{
auto const i = static_cast<size_t>((b + a) / 2);
auto const i = a + static_cast<size_t>((b - a) / 2);
auto const& I = ranges[i];
if (I.interval.to < codepoint)
{
if (i == b)
return std::nullopt;
a = i + 1;
}
else if (I.interval.from > codepoint)
{
if (i == 0)
return std::nullopt;
b = i - 1;
}
else
return I.property;
}
Expand Down
Loading