Skip to content

Commit

Permalink
ICU-22342 Implement ExternalBreakEngineAPI
Browse files Browse the repository at this point in the history
ICU-22342 Fix comments
  • Loading branch information
FrankYFTang committed Aug 30, 2023
1 parent 2207e2c commit 02d5e71
Show file tree
Hide file tree
Showing 13 changed files with 484 additions and 64 deletions.
121 changes: 93 additions & 28 deletions icu4c/source/common/brkeng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/rbbi.h"

#include "brkeng.h"
#include "cmemory.h"
Expand Down Expand Up @@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() {
}

// Reports whether character c belongs to the set of characters this engine
// has recorded as handled (fHandled). Evaluates to false while fHandled is
// null. In the two-argument form the locale argument is accepted but ignored.
UBool
UnhandledEngine::handles(UChar32 c) const {
UnhandledEngine::handles(UChar32 c, const char* locale) const {
(void)locale; // Unused
return fHandled && fHandled->contains(c);
}

int32_t
UnhandledEngine::findBreaks( UText *text,
int32_t /* startPos */,
int32_t startPos,
int32_t endPos,
UVector32 &/*foundBreaks*/,
UBool /* isPhraseBreaking */,
UErrorCode &status) const {
if (U_FAILURE(status)) return 0;
UChar32 c = utext_current32(text);
utext_setNativeIndex(text, startPos);
UChar32 c = utext_current32(text);
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
Expand Down Expand Up @@ -120,49 +123,47 @@ ICULanguageBreakFactory::~ICULanguageBreakFactory() {
}
}

U_NAMESPACE_END
U_CDECL_BEGIN
static void U_CALLCONV _deleteEngine(void *obj) {
delete (const icu::LanguageBreakEngine *) obj;
/**
 * Lazily creates the engine stack (fEngines) the first time it is needed.
 *
 * The stack is created with uprv_deleteUObject as its element deleter.
 * fEngines is only assigned when construction succeeded; on failure it is
 * left unchanged and the error is reported through status.
 *
 * @param status The error code; set if the stack could not be created.
 */
void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
    // Serializes concurrent first-time initialization.
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);

    if (fEngines != nullptr) {
        return;  // Already set up by an earlier call.
    }

    LocalPointer<UStack> engines(new UStack(uprv_deleteUObject, nullptr, status), status);
    if (U_FAILURE(status)) {
        return;  // The LocalPointer releases any partially built stack.
    }
    fEngines = engines.orphan();
}
U_CDECL_END
U_NAMESPACE_BEGIN

// Returns an engine from fEngines whose handles() accepts c (together with
// locale in the two-argument form). When no stored engine matches, asks
// loadEngineFor() for one and pushes a non-null result onto fEngines.
// Returns nullptr when the engine stack cannot be set up, when no engine is
// found or loaded, or when pushing the new engine reports a failure. The
// error code itself is local and never reaches the caller.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
ensureEngines(status);
if (U_FAILURE(status) ) {
// Note: no way to return error code to caller.
return nullptr;
}

// Function-local mutex guarding the lookup and the push below.
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);

if (fEngines == nullptr) {
LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
if (U_FAILURE(status) ) {
// Note: no way to return error code to caller.
return nullptr;
}
fEngines = engines.orphan();
} else {
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != nullptr && lbe->handles(c)) {
return lbe;
}
// Scan from the highest index down to 0, so the most recently pushed
// engines are consulted first.
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != nullptr && lbe->handles(c, locale)) {
return lbe;
}
}

// We didn't find an engine. Create one.
lbe = loadEngineFor(c);
lbe = loadEngineFor(c, locale);
if (lbe != nullptr) {
fEngines->push((void *)lbe, status);
}
return U_SUCCESS(status) ? lbe : nullptr;
}

const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
Expand Down Expand Up @@ -299,6 +300,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
return nullptr;
}


/**
 * Adopts an ExternalBreakEngine, wraps it in a BreakEngineWrapper and pushes
 * the wrapper onto the engine stack (fEngines).
 *
 * Ownership of external passes to this function on every path: if anything
 * fails before the wrapper is stored, the adopted engine is released here.
 *
 * @param external The engine to adopt.
 * @param status   The error code. If it already indicates failure on entry,
 *                 or the engine stack cannot be created, nothing is added.
 */
void ICULanguageBreakFactory::addExternalEngine(
        ExternalBreakEngine* external, UErrorCode& status) {
    // Adopt immediately so the engine is freed on every early-exit path.
    LocalPointer<ExternalBreakEngine> engine(external, status);
    ensureEngines(status);
    if (U_FAILURE(status)) {
        // ensureEngines() only assigns fEngines on success, so it may still be
        // nullptr here; dereferencing it below would crash.
        return;
    }

    LocalPointer<BreakEngineWrapper> wrapper(
        new BreakEngineWrapper(engine.orphan(), status), status);
    if (U_FAILURE(status)) {
        return;  // wrapper (if allocated) owns and releases the engine.
    }

    // NOTE(review): this mutex is local to this function and is a different
    // object from the function-local mutexes in ensureEngines() and
    // getEngineFor(), so it does not serialize this push against lookups or
    // pushes performed there. Confirm whether a shared mutex is intended.
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);
    fEngines->push(wrapper.getAlias(), status);
    // Ownership is handed to the stack, which was created with an element
    // deleter; give up our claim so the wrapper is not deleted twice.
    wrapper.orphan();
}

/**
 * Builds a wrapper that takes ownership of the given external engine.
 *
 * @param engine The ExternalBreakEngine to adopt into the delegate pointer.
 * @param status The error code, forwarded to the delegate's LocalPointer.
 */
BreakEngineWrapper::BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status)
    : delegate(engine, status)
{
}

/** Nothing to do explicitly: the delegate LocalPointer releases the engine. */
BreakEngineWrapper::~BreakEngineWrapper() = default;

/**
 * Asks the wrapped external engine whether it is meant for the given
 * character and locale.
 *
 * @param c      The character to test.
 * @param locale The locale, passed through unchanged to the delegate.
 * @return The delegate's isFor() answer.
 */
UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
    const UBool accepted = delegate->isFor(c, locale);
    return accepted;
}

/**
 * Finds breaks in a run of text by delegating to the wrapped external engine.
 *
 * Starting at startPos, the run is extended while the position is below
 * endPos and the delegate's handles() accepts the current character. The
 * delegate's fillBreak() is then given scratch space appended to foundBreaks,
 * and foundBreaks is trimmed to the number of entries fillBreak() reports.
 * On success the text position is left at the end of the run.
 *
 * @param text        The text to scan.
 * @param startPos    Native index at which the run begins.
 * @param endPos      Native index that bounds the run.
 * @param foundBreaks Receives the breaks; existing entries are preserved.
 * @param status      The error code. On failure foundBreaks keeps its
 *                    original size.
 * @return The value returned by fillBreak(), or 0 on failure.
 */
int32_t BreakEngineWrapper::findBreaks(
        UText *text,
        int32_t startPos,
        int32_t endPos,
        UVector32 &foundBreaks,
        UBool /* isPhraseBreaking */,
        UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;

    // Find the span of characters the delegate handles. The span begins at
    // startPos and extends forward, never reaching past endPos.
    utext_setNativeIndex(text, startPos);
    const int32_t rangeStart = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t rangeEnd = rangeStart;
    UChar32 c = utext_current32(text);
    while ((rangeEnd = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos &&
           delegate->handles(c)) {
        utext_next32(text); // TODO: recast loop for postincrement
        c = utext_current32(text);
    }

    const int32_t beforeSize = foundBreaks.size();
    const int32_t additionalCapacity = rangeEnd - rangeStart + 1;
    // Make room for (rangeEnd-rangeStart+1) more items after the existing ones.
    foundBreaks.ensureCapacity(beforeSize + additionalCapacity, status);
    if (U_FAILURE(status)) return 0;

    // Expose exactly the scratch slots handed to the delegate. beforeSize must
    // be counted once; adding it twice over-sizes the vector.
    foundBreaks.setSize(beforeSize + additionalCapacity);
    const int32_t result = delegate->fillBreak(
        text, rangeStart, rangeEnd,
        foundBreaks.getBuffer() + beforeSize, additionalCapacity, status);
    if (U_FAILURE(status)) {
        // Drop the scratch slots so no zero-filled entries are left behind.
        foundBreaks.setSize(beforeSize);
        return 0;
    }

    // result is treated as the number of entries the delegate wrote.
    foundBreaks.setSize(beforeSize + result);
    utext_setNativeIndex(text, rangeEnd);
    return result;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
59 changes: 53 additions & 6 deletions icu4c/source/common/brkeng.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#ifndef BRKENG_H
#define BRKENG_H

#include "unicode/umisc.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
Expand All @@ -21,6 +22,7 @@ class UnicodeSet;
class UStack;
class UVector32;
class DictionaryMatcher;
class ExternalBreakEngine;

/*******************************************************************
* LanguageBreakEngine
Expand All @@ -35,7 +37,7 @@ class DictionaryMatcher;
* <p>LanguageBreakEngines should normally be implemented so as to
* be shared between threads without locking.</p>
*/
class LanguageBreakEngine : public UMemory {
class LanguageBreakEngine : public UObject {
public:

/**
Expand All @@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const = 0;
virtual UBool handles(UChar32 c, const char* locale) const = 0;

/**
* <p>Find any breaks within a run in the supplied text.</p>
Expand All @@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory {

};

/*******************************************************************
* BreakEngineWrapper
*/

/**
* <p>BreakEngineWrapper implements LanguageBreakEngine as a thin wrapper
* that delegates its work to an ExternalBreakEngine.
* </p>
*/
class BreakEngineWrapper : public LanguageBreakEngine {
public:

// Wraps the given engine; the pointer is stored in the owning
// LocalPointer member below. status is the error code.
BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);

virtual ~BreakEngineWrapper();

// Reports whether this engine handles character c for the given locale.
virtual UBool handles(UChar32 c, const char* locale) const override;

// Finds breaks within the run of text between startPos and endPos and
// adds them to foundBreaks.
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode &status) const override;

private:
// The wrapped external engine, owned by this wrapper.
LocalPointer<ExternalBreakEngine> delegate;
};

/*******************************************************************
* LanguageBreakFactory
*/
Expand Down Expand Up @@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;

};

Expand Down Expand Up @@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const override;
virtual UBool handles(UChar32 c, const char* locale) const override;

/**
* <p>Find any breaks within a run in the supplied text.</p>
Expand Down Expand Up @@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;

/**
* Add and adopt the engine.
* @param engine The ExternalBreakEngine to be added and adopted. The caller
* passes ownership and must not release the memory after this call.
* @param status the error code.
*/
virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);

protected:
/**
Expand All @@ -258,9 +301,10 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);

/**
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
Expand All @@ -269,6 +313,9 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
* @return A DictionaryMatcher with the desired characteristics, or nullptr.
*/
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);

private:
void ensureEngines(UErrorCode& status);
};

U_NAMESPACE_END
Expand Down
14 changes: 13 additions & 1 deletion icu4c/source/common/brkiter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "unicode/rbbi.h"
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/filteredbrk.h"
Expand Down Expand Up @@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != nullptr) {
U_LOCALE_BASED(locBased, *(BreakIterator*)result);

locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
actualLocale.data());
uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
}

ures_close(b);
Expand Down Expand Up @@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count)

// Default constructor: writes a terminating 0 into the first element of each
// locale ID buffer, so every stored locale ID starts out as an empty string.
BreakIterator::BreakIterator()
{
*validLocale = *actualLocale = 0;
*validLocale = *actualLocale = *requestLocale = 0;
}

/**
 * Copy constructor: duplicates the requested, valid and actual locale IDs
 * of other into this object's own buffers.
 */
BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
    // The three copies are independent of one another.
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
}

/**
 * Assignment: copies the requested, valid and actual locale IDs from other.
 * Self-assignment leaves the object untouched.
 *
 * @return A reference to this object.
 */
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    return *this;
}
Expand Down Expand Up @@ -493,12 +499,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)

/**
 * Returns the locale of the requested kind as a Locale object.
 *
 * The requested locale comes from this object's own requestLocale buffer;
 * every other type is answered by the LocaleBased helper.
 */
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocale(type, status);
    }
    return Locale(requestLocale);
}

/**
 * Returns the locale ID string of the requested kind.
 *
 * The requested locale is served directly from the requestLocale buffer;
 * every other type is answered by the LocaleBased helper.
 */
const char *
BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocaleID(type, status);
    }
    return requestLocale;
}
Expand Down
Loading

0 comments on commit 02d5e71

Please sign in to comment.