diff --git a/icu4c/source/common/brkeng.cpp b/icu4c/source/common/brkeng.cpp index ce3d09cf23b8..c8442310b8b3 100644 --- a/icu4c/source/common/brkeng.cpp +++ b/icu4c/source/common/brkeng.cpp @@ -21,6 +21,7 @@ #include "unicode/uscript.h" #include "unicode/ucharstrie.h" #include "unicode/bytestrie.h" +#include "unicode/rbbi.h" #include "brkeng.h" #include "cmemory.h" @@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() { } UBool -UnhandledEngine::handles(UChar32 c) const { +UnhandledEngine::handles(UChar32 c, const char* locale) const { + (void)locale; // Unused return fHandled && fHandled->contains(c); } int32_t UnhandledEngine::findBreaks( UText *text, - int32_t /* startPos */, + int32_t startPos, int32_t endPos, UVector32 &/*foundBreaks*/, UBool /* isPhraseBreaking */, UErrorCode &status) const { if (U_FAILURE(status)) return 0; - UChar32 c = utext_current32(text); + utext_setNativeIndex(text, startPos); + UChar32 c = utext_current32(text); while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { utext_next32(text); // TODO: recast loop to work with post-increment operations. c = utext_current32(text); @@ -120,41 +123,39 @@ ICULanguageBreakFactory::~ICULanguageBreakFactory() { } } -U_NAMESPACE_END -U_CDECL_BEGIN -static void U_CALLCONV _deleteEngine(void *obj) { - delete (const icu::LanguageBreakEngine *) obj; +void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) { + static UMutex gBreakEngineMutex; + Mutex m(&gBreakEngineMutex); + if (fEngines == nullptr) { + LocalPointer engines(new UStack(uprv_deleteUObject, nullptr, status), status); + if (U_SUCCESS(status)) { + fEngines = engines.orphan(); + } + } } -U_CDECL_END -U_NAMESPACE_BEGIN const LanguageBreakEngine * -ICULanguageBreakFactory::getEngineFor(UChar32 c) { +ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) { const LanguageBreakEngine *lbe = nullptr; UErrorCode status = U_ZERO_ERROR; + ensureEngines(status); + if (U_FAILURE(status) ) { + // Note: no way to return error code to caller. + return nullptr; + } static UMutex gBreakEngineMutex; Mutex m(&gBreakEngineMutex); - - if (fEngines == nullptr) { - LocalPointer engines(new UStack(_deleteEngine, nullptr, status), status); - if (U_FAILURE(status) ) { - // Note: no way to return error code to caller. - return nullptr; - } - fEngines = engines.orphan(); - } else { - int32_t i = fEngines->size(); - while (--i >= 0) { - lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); - if (lbe != nullptr && lbe->handles(c)) { - return lbe; - } + int32_t i = fEngines->size(); + while (--i >= 0) { + lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); + if (lbe != nullptr && lbe->handles(c, locale)) { + return lbe; } } - + // We didn't find an engine. Create one. - lbe = loadEngineFor(c); + lbe = loadEngineFor(c, locale); if (lbe != nullptr) { fEngines->push((void *)lbe, status); } @@ -162,7 +163,7 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c) { } const LanguageBreakEngine * -ICULanguageBreakFactory::loadEngineFor(UChar32 c) { +ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) { UErrorCode status = U_ZERO_ERROR; UScriptCode code = uscript_getScript(c, &status); if (U_SUCCESS(status)) { @@ -299,6 +300,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { return nullptr; } + +void ICULanguageBreakFactory::addExternalEngine( + ExternalBreakEngine* external, UErrorCode& status) { + LocalPointer engine(external, status); + ensureEngines(status); + LocalPointer wrapper( + new BreakEngineWrapper(engine.orphan(), status), status); + static UMutex gBreakEngineMutex; + Mutex m(&gBreakEngineMutex); + fEngines->push(wrapper.getAlias(), status); + wrapper.orphan(); +} + +BreakEngineWrapper::BreakEngineWrapper( + ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) { +} + +BreakEngineWrapper::~BreakEngineWrapper() { +} + +UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const { + return delegate->isFor(c, locale); +} + +int32_t BreakEngineWrapper::findBreaks( + UText *text, + int32_t startPos, + int32_t endPos, + UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, + UErrorCode &status) const { + if (U_FAILURE(status)) return 0; + int32_t result = 0; + + // Find the span of characters included in the set. + // The span to break begins at the current position in the text, and + // extends towards the start or end of the text, depending on 'reverse'. + + utext_setNativeIndex(text, startPos); + int32_t start = (int32_t)utext_getNativeIndex(text); + int32_t current; + int32_t rangeStart; + int32_t rangeEnd; + UChar32 c = utext_current32(text); + while((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) { + utext_next32(text); // TODO: recast loop for postincrement + c = utext_current32(text); + } + rangeStart = start; + rangeEnd = current; + int32_t beforeSize = foundBreaks.size(); + int32_t additionalCapacity = rangeEnd - rangeStart + 1; + // enlarge to contains (rangeEnd-rangeStart+1) more items + foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status); + if (U_FAILURE(status)) return 0; + foundBreaks.setSize(beforeSize + beforeSize+additionalCapacity); + result = delegate->fillBreak(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize, + additionalCapacity, status); + if (U_FAILURE(status)) return 0; + foundBreaks.setSize(beforeSize + result); + utext_setNativeIndex(text, current); + return result; +} + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/common/brkeng.h b/icu4c/source/common/brkeng.h index 240dc8f4d344..42a3d697cfef 100644 --- a/icu4c/source/common/brkeng.h +++ b/icu4c/source/common/brkeng.h @@ -10,6 +10,7 @@ #ifndef BRKENG_H #define BRKENG_H +#include "unicode/umisc.h" #include "unicode/utypes.h" #include "unicode/uobject.h" #include "unicode/utext.h" @@ -21,6 +22,7 @@ class UnicodeSet; class UStack; class UVector32; class DictionaryMatcher; +class ExternalBreakEngine; /******************************************************************* * LanguageBreakEngine @@ -35,7 +37,7 @@ class DictionaryMatcher; *

LanguageBreakEngines should normally be implemented so as to * be shared between threads without locking.

*/ -class LanguageBreakEngine : public UMemory { +class LanguageBreakEngine : public UObject { public: /** @@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory { * a particular kind of break.

* * @param c A character which begins a run that the engine might handle + * @param locale The locale. * @return true if this engine handles the particular character and break * type. */ - virtual UBool handles(UChar32 c) const = 0; + virtual UBool handles(UChar32 c, const char* locale) const = 0; /** *

Find any breaks within a run in the supplied text.

@@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory { }; +/******************************************************************* + * BreakEngineWrapper + */ + +/** + *

BreakEngineWrapper implement LanguageBreakEngine by + * a thin wrapper that delegate the task to ExternalBreakEngine + *

+ */ +class BreakEngineWrapper : public LanguageBreakEngine { + public: + + BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status); + + virtual ~BreakEngineWrapper(); + + virtual UBool handles(UChar32 c, const char* locale) const override; + + virtual int32_t findBreaks( UText *text, + int32_t startPos, + int32_t endPos, + UVector32 &foundBreaks, + UBool isPhraseBreaking, + UErrorCode &status) const override; + + private: + LocalPointer delegate; +}; + /******************************************************************* * LanguageBreakFactory */ @@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory { * * @param c A character that begins a run for which a LanguageBreakEngine is * sought. + * @param locale The locale. * @return A LanguageBreakEngine with the desired characteristics, or 0. */ - virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0; + virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0; }; @@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine { * a particular kind of break.

* * @param c A character which begins a run that the engine might handle + * @param locale The locale. * @return true if this engine handles the particular character and break * type. */ - virtual UBool handles(UChar32 c) const override; + virtual UBool handles(UChar32 c, const char* locale) const override; /** *

Find any breaks within a run in the supplied text.

@@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory { * * @param c A character that begins a run for which a LanguageBreakEngine is * sought. + * @param locale The locale. * @return A LanguageBreakEngine with the desired characteristics, or 0. */ - virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override; + virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override; + + /** + * Add and adopt the engine and return an URegistryKey. + * @param engine The ExternalBreakEngine to be added and adopt. The caller + * pass the ownership and should not release the memory after this. + * @param status the error code. + */ + virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status); protected: /** @@ -258,9 +301,10 @@ class ICULanguageBreakFactory : public LanguageBreakFactory { * * @param c A character that begins a run for which a LanguageBreakEngine is * sought. + * @param locale The locale. * @return A LanguageBreakEngine with the desired characteristics, or 0. */ - virtual const LanguageBreakEngine *loadEngineFor(UChar32 c); + virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale); /** *

Create a DictionaryMatcher for the specified script and break type.

@@ -269,6 +313,9 @@ class ICULanguageBreakFactory : public LanguageBreakFactory { * @return A DictionaryMatcher with the desired characteristics, or nullptr. */ virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script); + + private: + void ensureEngines(UErrorCode& status); }; U_NAMESPACE_END diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp index 41e4e0dff578..b452cf2c0500 100644 --- a/icu4c/source/common/brkiter.cpp +++ b/icu4c/source/common/brkiter.cpp @@ -27,6 +27,7 @@ #include "unicode/rbbi.h" #include "unicode/brkiter.h" #include "unicode/udata.h" +#include "unicode/uloc.h" #include "unicode/ures.h" #include "unicode/ustring.h" #include "unicode/filteredbrk.h" @@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st // If there is a result, set the valid locale and actual locale, and the kind if (U_SUCCESS(status) && result != nullptr) { U_LOCALE_BASED(locBased, *(BreakIterator*)result); + locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale.data()); + uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY); + result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate } ures_close(b); @@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count) BreakIterator::BreakIterator() { - *validLocale = *actualLocale = 0; + *validLocale = *actualLocale = *requestLocale = 0; } BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) { uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale)); uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale)); + uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale)); } BreakIterator &BreakIterator::operator =(const BreakIterator &other) { if (this != &other) { uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale)); uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale)); + uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale)); } return *this; } @@ -493,12 +499,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) Locale BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const { + if (type == ULOC_REQUESTED_LOCALE) { + return Locale(requestLocale); + } U_LOCALE_BASED(locBased, *this); return locBased.getLocale(type, status); } const char * BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const { + if (type == ULOC_REQUESTED_LOCALE) { + return requestLocale; + } U_LOCALE_BASED(locBased, *this); return locBased.getLocaleID(type, status); } diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 0e420c67c5d2..3d672c03bfb3 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -42,7 +42,7 @@ DictionaryBreakEngine::~DictionaryBreakEngine() { } UBool -DictionaryBreakEngine::handles(UChar32 c) const { +DictionaryBreakEngine::handles(UChar32 c, const char*) const { return fSet.contains(c); } @@ -54,13 +54,13 @@ DictionaryBreakEngine::findBreaks( UText *text, UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; - (void)startPos; // TODO: remove this param? int32_t result = 0; // Find the span of characters included in the set. // The span to break begins at the current position in the text, and // extends towards the start or end of the text, depending on 'reverse'. + utext_setNativeIndex(text, startPos); int32_t start = (int32_t)utext_getNativeIndex(text); int32_t current; int32_t rangeStart; diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h index a2c761bdc3ac..e512071fa45d 100644 --- a/icu4c/source/common/dictbe.h +++ b/icu4c/source/common/dictbe.h @@ -62,10 +62,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine { * a particular kind of break.

* * @param c A character which begins a run that the engine might handle + * @param locale The locale. * @return true if this engine handles the particular character and break * type. */ - virtual UBool handles(UChar32 c) const override; + virtual UBool handles(UChar32 c, const char* locale) const override; /** *

Find any breaks within a run in the supplied text.

diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 73716ab4066c..599279fb72bb 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -1125,6 +1125,7 @@ static icu::UStack *gLanguageBreakFactories = nullptr; static const icu::UnicodeString *gEmptyString = nullptr; static icu::UInitOnce gLanguageBreakFactoriesInitOnce {}; static icu::UInitOnce gRBBIInitOnce {}; +static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr; /** * Release all static memory held by breakiterator. @@ -1153,37 +1154,41 @@ static void U_CALLCONV rbbiInit() { ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); } -static void U_CALLCONV initLanguageFactories() { - UErrorCode status = U_ZERO_ERROR; +static void U_CALLCONV initLanguageFactories(UErrorCode& status) { U_ASSERT(gLanguageBreakFactories == nullptr); gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status); if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) { - ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); - gLanguageBreakFactories->push(builtIn, status); + LocalPointer factory(new ICULanguageBreakFactory(status), status); + if (U_SUCCESS(status)) { + gICULanguageBreakFactory = factory.orphan(); + gLanguageBreakFactories->push(gICULanguageBreakFactory, status); #ifdef U_LOCAL_SERVICE_HOOK - LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); - if (extra != nullptr) { - gLanguageBreakFactories->push(extra, status); - } + LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); + if (extra != nullptr) { + gLanguageBreakFactories->push(extra, status); + } #endif + } } ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); } +void ensureLanguageFactories(UErrorCode& status) { + umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status); +} static const LanguageBreakEngine* -getLanguageBreakEngineFromFactory(UChar32 c) +getLanguageBreakEngineFromFactory(UChar32 c, const char* locale) { - umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); - if (gLanguageBreakFactories == nullptr) { - return nullptr; - } + UErrorCode status = U_ZERO_ERROR; + ensureLanguageFactories(status); + if (U_FAILURE(status)) return nullptr; int32_t i = gLanguageBreakFactories->size(); const LanguageBreakEngine *lbe = nullptr; while (--i >= 0) { LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); - lbe = factory->getEngineFor(c); + lbe = factory->getEngineFor(c, locale); if (lbe != nullptr) { break; } @@ -1199,7 +1204,7 @@ getLanguageBreakEngineFromFactory(UChar32 c) // //------------------------------------------------------------------------------- const LanguageBreakEngine * -RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { +RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) { const LanguageBreakEngine *lbe = nullptr; UErrorCode status = U_ZERO_ERROR; @@ -1215,14 +1220,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { int32_t i = fLanguageBreakEngines->size(); while (--i >= 0) { lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); - if (lbe->handles(c)) { + if (lbe->handles(c, locale)) { return lbe; } } // No existing dictionary took the character. See if a factory wants to // give us a new LanguageBreakEngine for this character. - lbe = getLanguageBreakEngineFromFactory(c); + lbe = getLanguageBreakEngineFromFactory(c, locale); // If we got one, use it and push it on our stack. if (lbe != nullptr) { @@ -1259,6 +1264,18 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { return fUnhandledBreakEngine; } +#ifndef U_HIDE_DRAFT_API +void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine( + ExternalBreakEngine* toAdopt, UErrorCode& status) { + LocalPointer engine(toAdopt, status); + if (U_FAILURE(status)) return; + ensureLanguageFactories(status); + if (U_FAILURE(status)) return; + gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status); +} +#endif /* U_HIDE_DRAFT_API */ + + void RuleBasedBreakIterator::dumpCache() { fBreakCache->dumpCache(); } diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 02ca555a890c..f7a283f69e45 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -158,12 +158,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo // We now have a dictionary character. Get the appropriate language object // to deal with it. - const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); + const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine( + c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status)); // Ask the language object if there are any breaks. It will add them to the cache and // leave the text pointer on the other side of its range, ready to search for the next one. if (lbe != nullptr) { - foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); + foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); } // Reload the loop variables for the next go-round diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h index 108652799e69..1b10e6ef1165 100644 --- a/icu4c/source/common/unicode/brkiter.h +++ b/icu4c/source/common/unicode/brkiter.h @@ -649,6 +649,7 @@ class U_COMMON_API BreakIterator : public UObject { /** @internal (private) */ char actualLocale[ULOC_FULLNAME_CAPACITY]; char validLocale[ULOC_FULLNAME_CAPACITY]; + char requestLocale[ULOC_FULLNAME_CAPACITY]; }; #ifndef U_HIDE_DEPRECATED_API diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 418b52e41f42..c137ac5c7a85 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -43,6 +43,71 @@ class RBBIDataWrapper; class UnhandledEngine; class UStack; + +#ifndef U_HIDE_DRAFT_API +#if !UCONFIG_NO_SERVICE +/** + * The ExternalBreakEngine class define an abstract interface for the host environment + * to provide a low level facility to break text for unicode text in script that the text boundary + * cannot be handled by upper level rule based logic, for example, for Chinese and Japanese + * word breaking, Thai, Khmer, Burmese, Lao and other Southeast Asian scripts. + * The host environment implement one or more subclass of ExternalBreakEngine and + * register them in the initialization time by calling + * RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopt and own the engine and will + * delete the registered external engine in proper time during the clean up + * event. + * @internal ICU 74 technology preview + */ +class ExternalBreakEngine : public UObject { + public: + /** + * destructor + * @internal ICU 74 technology preview + */ + virtual ~ExternalBreakEngine() {} + + /** + *

Indicate whether this engine handles a particular character when + * the RuleBasedBreakIterator is used for a particular locale. This method is used + * by the RuleBasedBreakIterator to find a break engine.

+ * @param c A character which begins a run that the engine might handle. + * @param locale The locale. + * @return true if this engine handles the particular character for that locale. + * @internal ICU 74 technology preview + */ + virtual bool isFor(UChar32 c, const char* locale) const = 0; + + /** + *

Indicate whether this engine handles a particular character.This method is + * used by the RuleBasedBreakIterator after it already find a break engine to see which + * characters after the first one can be handled by this break engine.

+ * @param c A character that the engine might handle. + * @return true if this engine handles the particular character. + * @internal ICU 74 technology preview + */ + virtual bool handles(UChar32 c) const = 0; + + /** + *

Divide up a range of text handled by this break engine.

+ * + * @param text A UText representing the text + * @param start The start of the range of known characters + * @param end The end of the range of known characters + * @param foundBreaks Output of C array of int32_t break positions, or + * nullptr + * @param foundBreaksCapacity The capacity of foundBreaks + * @param status Information on any errors encountered. + * @return The number of breaks found + * @internal ICU 74 technology preview + */ + virtual int32_t fillBreak(UText* text, int32_t start, int32_t end, + int32_t* foundBreaks, int32_t foundBreaksCapacity, + UErrorCode& status) const = 0; +}; +#endif /* UCONFIG_NO_SERVICE */ +#endif /* U_HIDE_DRAFT_API */ + + /** * * A subclass of BreakIterator whose behavior is specified using a list of rules. @@ -716,9 +781,10 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { * This function returns the appropriate LanguageBreakEngine for a * given character c. * @param c A character in the dictionary set + * @param locale The locale. * @internal (private) */ - const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c); + const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale); public: #ifndef U_HIDE_INTERNAL_API @@ -734,8 +800,26 @@ class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator { */ void dumpTables(); #endif /* U_HIDE_INTERNAL_API */ + +#ifndef U_HIDE_DRAFT_API +#if !UCONFIG_NO_SERVICE + /** + * Register a new external break engine. The external break engine will be adopted. + * Because ICU may choose to cache break engine internally, this must + * be called at application startup, prior to any calls to + * object methods of RuleBasedBreakIterator to avoid undefined behavior. + * @param toAdopt the ExternalBreakEngine instance to be adopted + * @param status the in/out status code, no special meanings are assigned + * @internal ICU 74 technology preview + */ + static void U_EXPORT2 registerExternalBreakEngine( + ExternalBreakEngine* toAdopt, UErrorCode& status); +#endif /* UCONFIG_NO_SERVICE */ +#endif /* U_HIDE_DRAFT_API */ + }; + U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/test/intltest/lstmbetst.cpp b/icu4c/source/test/intltest/lstmbetst.cpp index 0e3fe8c3f9e4..d1f28223b3c7 100644 --- a/icu4c/source/test/intltest/lstmbetst.cpp +++ b/icu4c/source/test/intltest/lstmbetst.cpp @@ -73,7 +73,7 @@ UScriptCode getScriptFromModelName(const std::string& modelName) { // the model. Since by default the LSTM models are not included, all the tested // models need to be included under source/test/testdata. -void LSTMBETest::runTestFromFile(const char* filename) { +void LSTMBETest::runTestFromFile(const char* filename, const char* locale) { UErrorCode status = U_ZERO_ERROR; LocalPointer engine; // Open and read the test data file. @@ -123,7 +123,7 @@ void LSTMBETest::runTestFromFile(const char* filename) { caseNum++; bool canHandleAllChars = true; for (int32_t i = 0; i < value.length(); i++) { - if (!engine->handles(value.charAt(i))) { + if (!engine->handles(value.charAt(i), locale)) { errln(UnicodeString("Test Case#") + caseNum + " contains char '" + UnicodeString(value.charAt(i)) + "' cannot be handled by the engine in offset " + i + "\n" + line); @@ -200,15 +200,15 @@ void LSTMBETest::runTestFromFile(const char* filename) { } void LSTMBETest::TestThaiGraphclust() { - runTestFromFile("Thai_graphclust_model4_heavy_Test.txt"); + runTestFromFile("Thai_graphclust_model4_heavy_Test.txt", "th"); } void LSTMBETest::TestThaiCodepoints() { - runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt"); + runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt", "th"); } void LSTMBETest::TestBurmeseGraphclust() { - runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt"); + runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", "my"); } const LanguageBreakEngine* LSTMBETest::createEngineFromTestData( diff --git a/icu4c/source/test/intltest/lstmbetst.h b/icu4c/source/test/intltest/lstmbetst.h index a5da3c9e83bc..0bc00578e660 100644 --- a/icu4c/source/test/intltest/lstmbetst.h +++ b/icu4c/source/test/intltest/lstmbetst.h @@ -40,7 +40,7 @@ class LSTMBETest: public IntlTest { private: const LanguageBreakEngine* createEngineFromTestData(const char* model, UScriptCode script, UErrorCode& status); - void runTestFromFile(const char* filename); + void runTestFromFile(const char* filename, const char* locale); void runTestWithLargeMemory(const char* model, UScriptCode script); // Test parameters, from the test framework and test invocation. diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 05572cbd0878..c7687bad5032 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -142,6 +142,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestLSTMThai); TESTCASE_AUTO(TestLSTMBurmese); TESTCASE_AUTO(TestRandomAccess); + TESTCASE_AUTO(TestExternalBreakEngineWithFakeTaiLe); + TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue); #if U_ENABLE_TRACING TESTCASE_AUTO(TestTraceCreateCharacter); @@ -5667,4 +5669,192 @@ void RBBITest::TestRandomAccess() { } } +// A Fake Tai Le break engine which handle Unicode Tai Le (Tale) block +// https://unicode.org/charts/PDF/U1950.pdf +// U+1950 - U+197F and always break after Tone letters (U+1970-U+1974) +class FakeTaiLeBreakEngine : public ExternalBreakEngine { + public: + FakeTaiLeBreakEngine() : block(0x1950, 0x197f), tones(0x1970, 0x1974) { + } + virtual ~FakeTaiLeBreakEngine() { + } + virtual bool isFor(UChar32 c, const char* /* locale */) const override { + // We implmement this for any locale, not return false for some langauge + // here. + return handles(c); + } + virtual bool handles(UChar32 c) const override { + return block.contains(c); + } + virtual int32_t fillBreak(UText* text, int32_t start, int32_t end, + int32_t* foundBreaks, int32_t foundBreaksCapacity, + UErrorCode& status) const override { + if (U_FAILURE(status)) return 0; + int32_t i = 0; + // Save the state of the utext + int64_t savedIndex = utext_getNativeIndex(text); + if (savedIndex != start) { + utext_setNativeIndex(text, start); + } + int32_t current; + while((current = (int32_t)utext_getNativeIndex(text)) < end) { + UChar32 c = utext_current32(text); + // Break after tone marks as a fake break point. + if (tones.contains(c)) { + if (i >= foundBreaksCapacity) { + status = U_BUFFER_OVERFLOW_ERROR; + utext_setNativeIndex(text, savedIndex); + return i; + } + foundBreaks[i++] = current; + } + UTEXT_NEXT32(text); + } + // Restore the utext + if (savedIndex != current) { + utext_setNativeIndex(text, savedIndex); + } + return i; + } + + private: + UnicodeSet block; + UnicodeSet tones; +}; + +// A Fake Yue Break Engine which handle CJK Unified Ideographs +// block (U+4E00-U+9FFF) when locale start with 'yue' and break +// after every character. +class FakeYueBreakEngine : public ExternalBreakEngine { + public: + FakeYueBreakEngine() : block(0x4e00, 0x9FFF) { + } + virtual ~FakeYueBreakEngine() { + } + virtual bool isFor(UChar32 c, const char* locale) const override { + // We implmement this for any locale starts with "yue" such as + // "yue", "yue-CN", "yue-Hant-CN", etc. + return handles(c) && uprv_strncmp("yue", locale, 3) == 0; + } + virtual bool handles(UChar32 c) const override { + return block.contains(c); + } + virtual int32_t fillBreak(UText* text, int32_t start, int32_t end, + int32_t* foundBreaks, int32_t foundBreaksCapacity, + UErrorCode& status) const override { + (void)text; + if (U_FAILURE(status)) return 0; + int32_t i = 0; + int32_t current = start; + while (current++ < end) { + // A fake word segmentation by breaking every two Unicode. + if ((current - start) % 2 == 0) { + if (i >= foundBreaksCapacity) { + status = U_BUFFER_OVERFLOW_ERROR; + return i; + } + foundBreaks[i++] = current; + } + } + return i; + } + + private: + UnicodeSet block; +}; + +void RBBITest::TestExternalBreakEngineWithFakeYue() { + UErrorCode status = U_ZERO_ERROR; + UnicodeString text(u"a bc def一兩年前佢真係唔鍾意畀我影相i jk lmn"); + + std::vector actual1; + { + LocalPointer bi1( + BreakIterator::createWordInstance(Locale::getRoot(), status), + status); + bi1->setText(text); + assertTrue(WHERE "BreakIterator::createWordInstance( root )", + U_SUCCESS(status)); + + do { + actual1.push_back(bi1->current()); + } while(bi1->next() != BreakIterator::DONE); + } + + std::vector expected1({{ 0, 1, 2, 4, 5, 8, 10, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 30}}); + assertTrue("root break Yue as Chinese", expected1 == actual1); + + status = U_ZERO_ERROR; + RuleBasedBreakIterator::registerExternalBreakEngine( + new FakeYueBreakEngine(), status); + assertTrue(WHERE "registerExternalBreakEngine w FakeYueBreakEngine", + U_SUCCESS(status)); + + std::vector actual2; + { + status = U_ZERO_ERROR; + LocalPointer bi2( + BreakIterator::createWordInstance(Locale("yue"), status), status); + assertTrue(WHERE "BreakIterator::createWordInstance( yue )", + U_SUCCESS(status)); + bi2->setText(text); + do { + actual2.push_back(bi2->current()); + } while(bi2->next() != BreakIterator::DONE); + } + std::vector expected2({{ 0, 1, 2, 4, 5, 8, 10, 12, 14, 16, 18, 20, + 22, 23, 24, 26, 27, 30}}); + assertTrue(WHERE "break Yue by Fake external breaker", + expected2 == actual2); +} + +void RBBITest::TestExternalBreakEngineWithFakeTaiLe() { + UErrorCode status = U_ZERO_ERROR; + UnicodeString text( + u"a bc defᥛᥫᥒᥰᥖᥭᥰᥞᥝᥰᥙᥥᥢᥛᥫᥒᥰᥑᥩᥢᥲᥔᥣᥝᥴᥓᥬᥖᥩᥢᥲᥛᥣᥝᥱᥙᥝᥱᥙᥤᥱᥓᥣᥒᥛᥣᥰᥓᥧ" + u"ᥰᥘᥩᥰᥗᥪᥒᥴᥛᥣᥰᥘᥬᥰᥝᥣᥱᥘᥒᥱᥔᥣᥛᥴᥘᥫᥢi jk lmn"); + + std::vector actual1; + { + LocalPointer bi1( + BreakIterator::createLineInstance(Locale::getRoot(), status), + status); + bi1->setText(text); + assertTrue(WHERE "BreakIterator::createLineInstance( root )", + U_SUCCESS(status)); + + do { + actual1.push_back(bi1->current()); + } while(bi1->next() != BreakIterator::DONE); + } + + std::vector expected1({{ + 0, 2, 5, 86, 89, 92 }}); + assertTrue(WHERE "root break Tai Le", expected1 == actual1); + + RuleBasedBreakIterator::registerExternalBreakEngine( + new FakeTaiLeBreakEngine(), status); + assertTrue(WHERE "registerExternalBreakEngine w FakeTaiLeBreakEngine", + U_SUCCESS(status)); + + std::vector actual2; + { + status = U_ZERO_ERROR; + LocalPointer bi2( + BreakIterator::createLineInstance(Locale("tdd"), status), status); + assertTrue(WHERE "BreakIterator::createLineInstance( tdd )", + U_SUCCESS(status)); + bi2->setText(text); + do { + actual2.push_back(bi2->current()); + } while(bi2->next() != BreakIterator::DONE); + } + std::vector expected2({{ + 0, 2, 5, 11, 14, 17, 24, 28, 32, 38, 42, 45, 48, 54, 57, 60, 64, 67, + 70, 73, 76, 80, 86, 89, 92}}); + assertTrue("break Tai Le by Fake external breaker", + expected2 == actual2); +} + #endif // #if !UCONFIG_NO_BREAK_ITERATION diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index c8785f723439..537a537863ad 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -96,6 +96,8 @@ class RBBITest: public IntlTest { void TestLSTMThai(); void TestLSTMBurmese(); void TestRandomAccess(); + void TestExternalBreakEngineWithFakeTaiLe(); + void TestExternalBreakEngineWithFakeYue(); #if U_ENABLE_TRACING void TestTraceCreateCharacter();