From 906227b9c6d01c8b8d4138e6348c2f3cb9438c0f Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 17:12:30 +0200 Subject: [PATCH 01/11] Update .gitignore Signed-off-by: Christian Parpart --- .gitignore | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 809f855..8659b0a 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,6 @@ src/libunicode/ucd.h src/libunicode/ucd_enums.h src/libunicode/ucd_fmt.h src/libunicode/ucd_ostream.h -/src/libunicode/codepoint_properties_data.cpp -/src/libunicode/codepoint_properties_data.h -/src/libunicode/codepoint_properties_names.cpp +src/libunicode/codepoint_properties_data.cpp +src/libunicode/codepoint_properties_data.h +src/libunicode/codepoint_properties_names.cpp From 01f3483de503d375d71cba14a45ce32539361871 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 17:13:16 +0200 Subject: [PATCH 02/11] Add .vimspector.json for easier interactive debugging in VIM Signed-off-by: Christian Parpart --- .vimspector.json | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .vimspector.json diff --git a/.vimspector.json b/.vimspector.json new file mode 100644 index 0000000..41f3cd3 --- /dev/null +++ b/.vimspector.json @@ -0,0 +1,25 @@ +{ + "$schema": "https://puremourning.github.io/vimspector/schema/vimspector.schema.json#", + "configurations": { + "ModelTest": { + "adapter": "vscode-cpptools", + "configuration": { + "request": "launch", + "program": "${workspaceRoot}/build/linux-clang-debug/src/libunicode/unicode_test", + "args": [ + "run_segmenter.LatinEmoji" + ], + "cwd": "${workspaceRoot}", + "externalConsole": true, + "stopAtEntry": false, + "MIMode": "gdb" + }, + "breakpoints": { + "exception": { + "caught": "Y", + "uncaught": "Y" + } + } + } + } +} From 5a8c836e14b413fd944d7c96ee720b7a125c030c Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 17:15:11 +0200 Subject: [PATCH 03/11] Update default UCD download URL to Unicode 16.0.0 Signed-off-by: Christian Parpart --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3963957..73fec0f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,11 +61,11 @@ if(LIBUNICODE_TESTING) endif() # ---------------------------------------------------------------------------- -set(LIBUNICODE_UCD_VERSION "15.0.0" CACHE STRING "libunicode: Unicode version") +set(LIBUNICODE_UCD_VERSION "16.0.0" CACHE STRING "libunicode: Unicode version") set(LIBUNICODE_UCD_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/_ucd" CACHE PATH "Path to directory for downloaded files & extracted directories.") set(LIBUNICODE_UCD_ZIP_DOWNLOAD_URL "https://www.unicode.org/Public/${LIBUNICODE_UCD_VERSION}/ucd/UCD.zip") -set(LIBUNICODE_UCD_MD5 "8c66407dd8ce2d84278868a69ea83280") +set(LIBUNICODE_UCD_MD5 "bdd823cbd37c376633d6737a12281233") set(LIBUNICODE_UCD_ZIP_FILE "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}.zip") set(LIBUNICODE_UCD_DIR "${LIBUNICODE_UCD_BASE_DIR}/ucd-${LIBUNICODE_UCD_VERSION}" CACHE PATH "Path to UCD directory.") From 924d08931ecfb96a1e2cd8c906bf580e84fab1cc Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 17:15:41 +0200 Subject: [PATCH 04/11] Adapt loader to Unicode 16.0.0 Signed-off-by: Christian Parpart --- src/libunicode/codepoint_properties_loader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libunicode/codepoint_properties_loader.cpp b/src/libunicode/codepoint_properties_loader.cpp index deb9666..7638f51 100644 --- a/src/libunicode/codepoint_properties_loader.cpp +++ b/src/libunicode/codepoint_properties_loader.cpp @@ -46,6 +46,7 @@ namespace pair { "13.0"sv, Age::V13_0 }, pair { "14.0"sv, Age::V14_0 }, pair { "15.0"sv, Age::V15_0 }, + pair { "16.0"sv, Age::V16_0 }, pair { "1.1"sv, Age::V1_1 }, pair { "2.0"sv, Age::V2_0 }, pair { "2.1"sv, Age::V2_1 }, From 285f88ce97d998f08a4685957bd68d27bc7bbb10 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 17:59:24 +0200 Subject: [PATCH 05/11] [ucd_private] search: use safer mid-point calculation Signed-off-by: Christian Parpart --- src/libunicode/ucd_private.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libunicode/ucd_private.h b/src/libunicode/ucd_private.h index de23e71..c9ceca2 100644 --- a/src/libunicode/ucd_private.h +++ b/src/libunicode/ucd_private.h @@ -63,7 +63,7 @@ constexpr std::optional search(std::array, N> const& ranges, char32_t while (a < b) { - auto const i = static_cast((b + a) / 2); + auto const i = a + static_cast((b - a) / 2); auto const& I = ranges[i]; if (I.interval.to < codepoint) a = i + 1; From a7cffe914066fd3a161f6d4fedee426e1bbf6d58 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 18:00:11 +0200 Subject: [PATCH 06/11] [ucd_private] search: guard against out-of-bounds (happened in Unicode 16.0.0 for ScriptExtensions) Signed-off-by: Christian Parpart --- src/libunicode/ucd_private.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/libunicode/ucd_private.h b/src/libunicode/ucd_private.h index c9ceca2..17fac8c 100644 --- a/src/libunicode/ucd_private.h +++ b/src/libunicode/ucd_private.h @@ -66,9 +66,17 @@ constexpr std::optional search(std::array, N> const& ranges, char32_t auto const i = a + static_cast((b - a) / 2); auto const& I = ranges[i]; if (I.interval.to < codepoint) + { + if (i == b) + return std::nullopt; a = i + 1; + } else if (I.interval.from > codepoint) + { + if (i == 0) + return std::nullopt; b = i - 1; + } else return I.property; } From 0cbbda5c45e70058e822d5f33b9d51e1980adc98 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 18:27:32 +0200 Subject: [PATCH 07/11] squash with .vimspector Signed-off-by: Christian Parpart --- .vimspector.json | 1 - 1 file changed, 1 deletion(-) diff --git a/.vimspector.json b/.vimspector.json index 41f3cd3..48fcfd0 100644 --- a/.vimspector.json +++ b/.vimspector.json @@ -7,7 +7,6 @@ "request": "launch", "program": "${workspaceRoot}/build/linux-clang-debug/src/libunicode/unicode_test", "args": [ - "run_segmenter.LatinEmoji" ], "cwd": "${workspaceRoot}", "externalConsole": true, From 8fa58afe3808b362ad60da02b55c84c430cd952a Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 18:33:09 +0200 Subject: [PATCH 08/11] run_segmenter_test: cleanups Signed-off-by: Christian Parpart --- src/libunicode/run_segmenter_test.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/libunicode/run_segmenter_test.cpp b/src/libunicode/run_segmenter_test.cpp index 0a83478..133b2fd 100644 --- a/src/libunicode/run_segmenter_test.cpp +++ b/src/libunicode/run_segmenter_test.cpp @@ -17,8 +17,6 @@ #include -#include -#include #include #include #include @@ -144,7 +142,7 @@ TEST_CASE("run_segmenter.JapaneseHindiEmojiMix", "[run_segmenter]") { U"🌱🌲", Script::Han, PresentationStyle::Emoji } }); } -TEST_CASE("run_segmenter.CombiningCirlce", "[run_segmenter]") +TEST_CASE("run_segmenter.CombiningCircle", "[run_segmenter]") { test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Common, PresentationStyle::Text } }); } From c277f274f4863c07817fec0fa1dd8a9243e77b4b Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 18:33:52 +0200 Subject: [PATCH 09/11] script_segmenter: mark some more member functions noexcept Signed-off-by: Christian Parpart --- src/libunicode/script_segmenter.cpp | 4 ++-- src/libunicode/script_segmenter.h | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/libunicode/script_segmenter.cpp b/src/libunicode/script_segmenter.cpp index f2dde2c..3f202de 100644 --- a/src/libunicode/script_segmenter.cpp +++ b/src/libunicode/script_segmenter.cpp @@ -60,7 +60,7 @@ optional script_segmenter::consume() return res; } -bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) +bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept { if (nextSet.empty() || currentSet.empty()) return false; @@ -122,7 +122,7 @@ bool script_segmenter::mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet return true; } -script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) +script_segmenter::ScriptSet script_segmenter::getScriptsFor(char32_t codepoint) noexcept { ScriptSet scriptSet; diff --git a/src/libunicode/script_segmenter.h b/src/libunicode/script_segmenter.h index ee1a547..b19d16f 100644 --- a/src/libunicode/script_segmenter.h +++ b/src/libunicode/script_segmenter.h @@ -18,7 +18,6 @@ #include #include -#include namespace unicode { @@ -81,13 +80,13 @@ class script_segmenter } /// Returnes all scripts that this @p _codepoint is associated with. - ScriptSet getScriptsFor(char32_t codepoint); + ScriptSet getScriptsFor(char32_t codepoint) noexcept; /// Intersects @p _nextSet into @p _currentSet. /// /// @retval true Intersection succeed, meaning that no boundary was found. /// @retval false The resulting intersection is empty, meaning, a script boundary was found. - bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet); + bool mergeSets(ScriptSet const& nextSet, ScriptSet& currentSet) noexcept; /// Returns the resolved script. /// From 431c9f76a1135f5eec5065220028dd5808a8464b Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 28 Sep 2024 18:34:14 +0200 Subject: [PATCH 10/11] script_segmenter: Add test case for (Common, Latin) to become Latin. Signed-off-by: Christian Parpart --- src/libunicode/script_segmenter_test.cpp | 27 ++++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/libunicode/script_segmenter_test.cpp b/src/libunicode/script_segmenter_test.cpp index b331055..77d2042 100644 --- a/src/libunicode/script_segmenter_test.cpp +++ b/src/libunicode/script_segmenter_test.cpp @@ -15,13 +15,9 @@ #include -#include #include using namespace std::string_view_literals; -using namespace std::string_view_literals; -using std::optional; -using unicode::script_segmenter; TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]") { @@ -35,15 +31,32 @@ TEST_CASE("script_segmenter.private_use_area", "[script_segmenter]") CHECK(res1.script == unicode::Script::Unknown); } +TEST_CASE("script_segmenter.common_to_specific", "[script_segmenter]") +{ + // '1' is script property Common, 'a' is script property Latin, so the whole string is Latin. + + auto constexpr str = U"1a"sv; + auto seg = unicode::script_segmenter { str.data(), str.size() }; + + std::optional const r1 = seg.consume(); + REQUIRE(r1.has_value()); + auto const res1 = r1.value(); + CHECK(res1.size == str.size()); + CHECK(res1.script == unicode::Script::Latin); + + auto const r2 = seg.consume(); + REQUIRE_FALSE(r2.has_value()); +} + TEST_CASE("script_segmenter.greek_kanji_greek", "[script_segmenter]") { char32_t const* str = U"λ 合気道 λ;"; - auto seg = script_segmenter { str }; + auto seg = unicode::script_segmenter { str }; // greek text - optional const r1 = seg.consume(); + std::optional const r1 = seg.consume(); REQUIRE(r1.has_value()); - script_segmenter::result const res1 = r1.value(); + unicode::script_segmenter::result const res1 = r1.value(); CHECK(res1.size == 2); CHECK(res1.script == unicode::Script::Greek); From 48e78ba0b44931f39f31bed965633d25cb4df0b2 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Mon, 30 Sep 2024 14:21:48 +0200 Subject: [PATCH 11/11] [tests] Fix run_segmenter.CombiningCircle test Signed-off-by: Christian Parpart --- src/libunicode/run_segmenter_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libunicode/run_segmenter_test.cpp b/src/libunicode/run_segmenter_test.cpp index 133b2fd..e075d1c 100644 --- a/src/libunicode/run_segmenter_test.cpp +++ b/src/libunicode/run_segmenter_test.cpp @@ -144,7 +144,7 @@ TEST_CASE("run_segmenter.JapaneseHindiEmojiMix", "[run_segmenter]") TEST_CASE("run_segmenter.CombiningCircle", "[run_segmenter]") { - test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Common, PresentationStyle::Text } }); + test_run_segmentation(__LINE__, { { U"◌́◌̀◌̈◌̂◌̄◌̊", Script::Latin, PresentationStyle::Text } }); } TEST_CASE("run_segmenter.Arab_Hangul", "[run_segmenter]")