deps: update icu to 75.1

PR-URL: #52573 Reviewed-By: Richard Lau <[email protected]> Reviewed-By: Michaël Zasso <[email protected]> Reviewed-By: Mohammed Keyvanzadeh <[email protected]> Reviewed-By: Luigi Pinca <[email protected]>
nodejs · May 3, 2024 · a135027 · a135027
1 parent b49464b
commit a135027
Show file tree

Hide file tree

Showing 408 changed files with 25,369 additions and 12,777 deletions.
diff --git a/deps/icu-small/LICENSE b/deps/icu-small/LICENSE
@@ -2,7 +2,7 @@ UNICODE LICENSE V3
 
 COPYRIGHT AND PERMISSION NOTICE
 
-Copyright © 2016-2023 Unicode, Inc.
+Copyright © 2016-2024 Unicode, Inc.
 
 NOTICE TO USER: Carefully read the following legal agreement. BY
 DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
@@ -38,6 +38,8 @@ not be used in advertising or otherwise to promote the sale, use or other
 dealings in these Data Files or Software without prior written
 authorization of the copyright holder.
 
+SPDX-License-Identifier: Unicode-3.0
+
 ----------------------------------------------------------------------
 
 Third-Party Software Licenses

diff --git a/deps/icu-small/README-FULL-ICU.txt b/deps/icu-small/README-FULL-ICU.txt
@@ -1,8 +1,8 @@
 ICU sources - auto generated by shrink-icu-src.py
 
 This directory contains the ICU subset used by --with-intl=full-icu
-It is a strict subset of ICU 74 source files with the following exception(s):
-* deps/icu-small/source/data/in/icudt74l.dat.bz2 : compressed data file
+It is a strict subset of ICU 75 source files with the following exception(s):
+* deps/icu-small/source/data/in/icudt75l.dat.bz2 : compressed data file
 
 
 To rebuild this directory, see ../../tools/icu/README.md

diff --git a/deps/icu-small/source/common/brkeng.cpp b/deps/icu-small/source/common/brkeng.cpp
@@ -114,13 +114,11 @@ UnhandledEngine::handleCharacter(UChar32 c) {
  */
 
 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
-    fEngines = 0;
+    fEngines = nullptr;
 }
 
 ICULanguageBreakFactory::~ICULanguageBreakFactory() {
-    if (fEngines != 0) {
-        delete fEngines;
-    }
+    delete fEngines;
 }
 
 void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {

diff --git a/deps/icu-small/source/common/brkiter.cpp b/deps/icu-small/source/common/brkiter.cpp
@@ -438,17 +438,14 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
             uprv_strcpy(lb_lw, "line");
             UErrorCode kvStatus = U_ZERO_ERROR;
-            CharString value;
-            CharStringByteSink valueSink(&value);
-            loc.getKeywordValue("lb", valueSink, kvStatus);
+            auto value = loc.getKeywordValue<CharString>("lb", kvStatus);
             if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
                 uprv_strcat(lb_lw, "_");
                 uprv_strcat(lb_lw, value.data());
             }
             // lw=phrase is only supported in Japanese and Korean
             if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
-                value.clear();
-                loc.getKeywordValue("lw", valueSink, kvStatus);
+                value = loc.getKeywordValue<CharString>("lw", kvStatus);
                 if (U_SUCCESS(kvStatus) && value == "phrase") {
                     uprv_strcat(lb_lw, "_");
                     uprv_strcat(lb_lw, value.data());
@@ -500,7 +497,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
 Locale
 BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
     if (type == ULOC_REQUESTED_LOCALE) {
-        return Locale(requestLocale);
+        return {requestLocale};
     }
     U_LOCALE_BASED(locBased, *this);
     return locBased.getLocale(type, status);

diff --git a/deps/icu-small/source/common/bytesinkutil.h b/deps/icu-small/source/common/bytesinkutil.h
@@ -7,18 +7,52 @@
 #ifndef BYTESINKUTIL_H
 #define BYTESINKUTIL_H
 
+#include <type_traits>
+
 #include "unicode/utypes.h"
 #include "unicode/bytestream.h"
 #include "unicode/edits.h"
+#include "charstr.h"
 #include "cmemory.h"
 #include "uassert.h"
+#include "ustr_imp.h"
 
 U_NAMESPACE_BEGIN
 
 class ByteSink;
-class CharString;
 class Edits;
 
+class U_COMMON_API CharStringByteSink : public ByteSink {
+public:
+    CharStringByteSink(CharString* dest);
+    ~CharStringByteSink() override;
+
+    CharStringByteSink() = delete;
+    CharStringByteSink(const CharStringByteSink&) = delete;
+    CharStringByteSink& operator=(const CharStringByteSink&) = delete;
+
+    void Append(const char* bytes, int32_t n) override;
+
+    char* GetAppendBuffer(int32_t min_capacity,
+                          int32_t desired_capacity_hint,
+                          char* scratch,
+                          int32_t scratch_capacity,
+                          int32_t* result_capacity) override;
+
+private:
+    CharString& dest_;
+};
+
+// CharString doesn't provide the public API that StringByteSink requires a
+// string class to have so this template specialization replaces the default
+// implementation of StringByteSink<CharString> with CharStringByteSink.
+template<>
+class StringByteSink<CharString> : public CharStringByteSink {
+ public:
+  StringByteSink(CharString* dest) : CharStringByteSink(dest) { }
+  StringByteSink(CharString* dest, int32_t /*initialAppendCapacity*/) : CharStringByteSink(dest) { }
+};
+
 class U_COMMON_API ByteSinkUtil {
 public:
     ByteSinkUtil() = delete;  // all static
@@ -57,30 +91,64 @@ class U_COMMON_API ByteSinkUtil {
                                  ByteSink &sink, uint32_t options, Edits *edits,
                                  UErrorCode &errorCode);
 
-private:
-    static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
-                                        ByteSink &sink, uint32_t options, Edits *edits);
-};
-
-class U_COMMON_API CharStringByteSink : public ByteSink {
-public:
-    CharStringByteSink(CharString* dest);
-    ~CharStringByteSink() override;
-
-    CharStringByteSink() = delete;
-    CharStringByteSink(const CharStringByteSink&) = delete;
-    CharStringByteSink& operator=(const CharStringByteSink&) = delete;
-
-    void Append(const char* bytes, int32_t n) override;
+    /**
+     * Calls a lambda that writes to a ByteSink with a CheckedArrayByteSink
+     * and then returns through u_terminateChars(), in order to implement
+     * the classic ICU4C C API writing to a fixed-size buffer on top of a
+     * contemporary C++ API.
+     *
+     * @param buffer receiving buffer
+     * @param capacity capacity of receiving buffer
+     * @param lambda that gets called with the sink as an argument
+     * @param status set to U_BUFFER_OVERFLOW_ERROR on overflow
+     * @return number of bytes written, or needed (in case of overflow)
+     * @internal
+     */
+    template <typename F,
+              typename = std::enable_if_t<
+                  std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
+    static int32_t viaByteSinkToTerminatedChars(char* buffer, int32_t capacity,
+                                                F&& lambda,
+                                                UErrorCode& status) {
+        if (U_FAILURE(status)) { return 0; }
+        CheckedArrayByteSink sink(buffer, capacity);
+        lambda(sink, status);
+        if (U_FAILURE(status)) { return 0; }
+
+        int32_t reslen = sink.NumberOfBytesAppended();
+
+        if (sink.Overflowed()) {
+            status = U_BUFFER_OVERFLOW_ERROR;
+            return reslen;
+        }
+
+        return u_terminateChars(buffer, capacity, reslen, &status);
+    }
 
-    char* GetAppendBuffer(int32_t min_capacity,
-                          int32_t desired_capacity_hint,
-                          char* scratch,
-                          int32_t scratch_capacity,
-                          int32_t* result_capacity) override;
+    /**
+     * Calls a lambda that writes to a ByteSink with a CharStringByteSink and
+     * then returns a CharString, in order to implement a contemporary C++ API
+     * on top of a C/C++ compatibility ByteSink API.
+     *
+     * @param lambda that gets called with the sink as an argument
+     * @param status to check and report
+     * @return the resulting string, or an empty string (in case of error)
+     * @internal
+     */
+    template <typename F,
+              typename = std::enable_if_t<
+                  std::is_invocable_r_v<void, F, ByteSink&, UErrorCode&>>>
+    static CharString viaByteSinkToCharString(F&& lambda, UErrorCode& status) {
+        if (U_FAILURE(status)) { return {}; }
+        CharString result;
+        CharStringByteSink sink(&result);
+        lambda(sink, status);
+        return result;
+    }
 
 private:
-    CharString& dest_;
+    static void appendNonEmptyUnchanged(const uint8_t *s, int32_t length,
+                                        ByteSink &sink, uint32_t options, Edits *edits);
 };
 
 U_NAMESPACE_END

diff --git a/deps/icu-small/source/common/caniter.cpp b/deps/icu-small/source/common/caniter.cpp
@@ -64,6 +64,7 @@ U_NAMESPACE_BEGIN
 
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
 
+
 /**
  *@param source string to get results for
  */
@@ -73,10 +74,10 @@ CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode
     pieces_lengths(nullptr),
     current(nullptr),
     current_length(0),
-    nfd(*Normalizer2::getNFDInstance(status)),
-    nfcImpl(*Normalizer2Factory::getNFCImpl(status))
+    nfd(Normalizer2::getNFDInstance(status)),
+    nfcImpl(Normalizer2Factory::getNFCImpl(status))
 {
-    if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) {
+    if(U_SUCCESS(status) && nfcImpl->ensureCanonIterData(status)) {
       setSource(sourceStr, status);
     }
 }
@@ -172,7 +173,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
     int32_t i = 0;
     UnicodeString *list = nullptr;
 
-    nfd.normalize(newSource, source, status);
+    nfd->normalize(newSource, source, status);
     if(U_FAILURE(status)) {
       return;
     }
@@ -194,7 +195,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
         current[0] = 0;
         pieces[0] = new UnicodeString[1];
         pieces_lengths[0] = 1;
-        if (pieces[0] == 0) {
+        if (pieces[0] == nullptr) {
             status = U_MEMORY_ALLOCATION_ERROR;
             goto CleanPartialInitialization;
         }
@@ -203,7 +204,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
 
 
     list = new UnicodeString[source.length()];
-    if (list == 0) {
+    if (list == nullptr) {
         status = U_MEMORY_ALLOCATION_ERROR;
         goto CleanPartialInitialization;
     }
@@ -219,7 +220,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
     // on the NFD form - see above).
     for (; i < source.length(); i += U16_LENGTH(cp)) {
         cp = source.char32At(i);
-        if (nfcImpl.isCanonSegmentStarter(cp)) {
+        if (nfcImpl->isCanonSegmentStarter(cp)) {
             source.extract(start, i-start, list[list_length++]); // add up to i
             start = i;
         }
@@ -252,9 +253,7 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
     return;
 // Common section to cleanup all local variables and reset object variables.
 CleanPartialInitialization:
-    if (list != nullptr) {
-        delete[] list;
-    }
+    delete[] list;
     cleanPieces();
 }
 
@@ -264,10 +263,19 @@ void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &st
  * @param source the string to find permutations for
  * @return the results in a set.
  */
-void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
+void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status, int32_t depth) {
     if(U_FAILURE(status)) {
         return;
     }
+    // To avoid an infinite loop caused by permute, we limit the depth of recursive
+    // calls to permute and return U_UNSUPPORTED_ERROR.
+    // We know that some unit tests need a depth of at least 4. Set to 8 just in
+    // case of unforeseen use cases.
+    constexpr int32_t kPermuteDepthLimit = 8;
+    if (depth > kPermuteDepthLimit) {
+        status = U_UNSUPPORTED_ERROR;
+        return;
+    }
     //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
     int32_t i = 0;
 
@@ -277,7 +285,7 @@ void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros
     if (source.length() <= 2 && source.countChar32() <= 1) {
         UnicodeString *toPut = new UnicodeString(source);
         /* test for nullptr */
-        if (toPut == 0) {
+        if (toPut == nullptr) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return;
         }
@@ -311,7 +319,7 @@ void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros
 
         // see what the permutations of the characters before and after this one are
         //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
-        permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status);
+        permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status, depth+1);
         /* Test for buffer overflows */
         if(U_FAILURE(status)) {
             return;
@@ -346,7 +354,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
     Hashtable permutations(status);
     Hashtable basic(status);
     if (U_FAILURE(status)) {
-        return 0;
+        return nullptr;
     }
     result.setValueDeleter(uprv_deleteUObject);
     permutations.setValueDeleter(uprv_deleteUObject);
@@ -381,7 +389,7 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
             //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
             UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
             UnicodeString attempt;
-            nfd.normalize(possible, attempt, status);
+            nfd->normalize(possible, attempt, status);
 
             // TODO: check if operator == is semanticaly the same as attempt.equals(segment)
             if (attempt==segment) {
@@ -399,15 +407,15 @@ UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, i
 
     /* Test for buffer overflows */
     if(U_FAILURE(status)) {
-        return 0;
+        return nullptr;
     }
     // convert into a String[] to clean up storage
     //String[] finalResult = new String[result.size()];
     UnicodeString *finalResult = nullptr;
     int32_t resultCount;
     if((resultCount = result.count()) != 0) {
         finalResult = new UnicodeString[resultCount];
-        if (finalResult == 0) {
+        if (finalResult == nullptr) {
             status = U_MEMORY_ALLOCATION_ERROR;
             return nullptr;
         }
@@ -448,7 +456,7 @@ Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const cha
     for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {
         // see if any character is at the start of some decomposition
         U16_GET(segment, 0, i, segLen, cp);
-        if (!nfcImpl.getCanonStartSet(cp, starts)) {
+        if (!nfcImpl->getCanonStartSet(cp, starts)) {
             continue;
         }
         // if so, see which decompositions match
@@ -471,7 +479,7 @@ Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const cha
                 UnicodeString item = *((UnicodeString *)(ne->value.pointer));
                 UnicodeString *toAdd = new UnicodeString(prefix);
                 /* test for nullptr */
-                if (toAdd == 0) {
+                if (toAdd == nullptr) {
                     status = U_MEMORY_ALLOCATION_ERROR;
                     return nullptr;
                 }
@@ -509,7 +517,7 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
     UnicodeString temp(comp);
     int32_t inputLen=temp.length();
     UnicodeString decompString;
-    nfd.normalize(temp, decompString, status);
+    nfd->normalize(temp, decompString, status);
     if (U_FAILURE(status)) {
         return nullptr;
     }
@@ -573,7 +581,7 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
     // brute force approach
     // check to make sure result is canonically equivalent
     UnicodeString trial;
-    nfd.normalize(temp, trial, status);
+    nfd->normalize(temp, trial, status);
     if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
         return nullptr;
     }