From d3de9272b9d749f74da47753ad70d3588e533ab2 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 8 Oct 2024 11:18:20 -0700 Subject: [PATCH 01/43] ICU-22789 Add initial code for Segmenter interface, concrete impl, Segments class --- .../com/ibm/icu/text/LocalizedSegmenter.java | 54 +++++++++++++++++++ .../main/java/com/ibm/icu/text/Segmenter.java | 5 ++ .../main/java/com/ibm/icu/text/Segments.java | 42 +++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java create mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java create mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java new file mode 100644 index 000000000000..8579678a025b --- /dev/null +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java @@ -0,0 +1,54 @@ +package com.ibm.icu.text; + +import com.ibm.icu.util.ULocale; + +public class LocalizedSegmenter implements Segmenter { + + private ULocale locale; + + private SegmentationType segmentationType; + + public enum SegmentationType { + CHARACTER, + WORD, + LINE, + SENTENCE, + // TITLE, + // COUNT + } + + public Builder builder() { + return new Builder(); + } + + LocalizedSegmenter(ULocale locale, SegmentationType segmentationType) { + this.locale = locale; + this.segmentationType = segmentationType; + } + + public static class Builder { + + private ULocale locale = ULocale.ROOT; + + private SegmentationType segmentationType = SegmentationType.CHARACTER; + + public Builder() { + } + + public Builder setLocale(ULocale locale) { + this.locale = locale; + return this; + } + + public Builder setSegmentationType(SegmentationType segmentationType) { + this.segmentationType = segmentationType; + return this; + } + + public LocalizedSegmenter build() { + return new LocalizedSegmenter(locale, segmentationType); + } + + } + +} diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java new file mode 100644 index 000000000000..8e1ad367167d --- /dev/null +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java @@ -0,0 +1,5 @@ +package com.ibm.icu.text; + +public interface Segmenter { + Segments segment(String s); +} diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java new file mode 100644 index 000000000000..4f355cf5a58c --- /dev/null +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java @@ -0,0 +1,42 @@ +package com.ibm.icu.text; + +import java.util.stream.Stream; + +public class Segments { + + private String sourceString; + + private Segmenter segmenter; + + public Segments(String sourceString, Segmenter segmenter) { + this.sourceString = sourceString; + this.segmenter = segmenter; + } + + public Stream subSequences() { + return ranges().map((range) -> sourceString.subSequence(range.getStart(), range.getLimit())); + } + + public Stream ranges() { + return null; + } + + public static class SegmentRange { + int start; + int limit; + + public SegmentRange(int start, int limit) { + this.start = start; + this.limit = limit; + } + + public int getStart() { + return start; + } + + public int getLimit(){ + return limit; + } + } + +} From 6ce5fdce8d7989648c1da034dc4e2945be11cf0f Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 8 Oct 2024 14:18:31 -0700 Subject: [PATCH 02/43] ICU-22789 Finish impl for LocalizedSegmenter --- .../com/ibm/icu/text/LocalizedSegmenter.java | 79 ++++++++++++++++++- .../com/ibm/icu/text/RuleBasedSegmenter.java | 2 + .../main/java/com/ibm/icu/text/Segments.java | 21 ++--- .../dev/test/text/LocalizedSegmenterTest.java | 48 +++++++++++ .../dev/test/text/RuleBasedSegmenterTest.java | 5 ++ .../ibm/icu/dev/test/text/SegmentsTest.java | 69 ++++++++++++++++ 6 files changed, 205 insertions(+), 19 deletions(-) create mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java create mode 100644 icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java create mode 100644 icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java create mode 100644 icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java index 8579678a025b..7fb5397dfa51 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java @@ -1,6 +1,7 @@ package com.ibm.icu.text; import com.ibm.icu.util.ULocale; +import java.util.stream.Stream; public class LocalizedSegmenter implements Segmenter { @@ -8,6 +9,19 @@ public class LocalizedSegmenter implements Segmenter { private SegmentationType segmentationType; + @Override + public Segments segment(String s) { + return new LocalizedSegments(s, this); + } + + public ULocale getLocale() { + return this.locale; + } + + public SegmentationType getSegmentationType() { + return this.segmentationType; + } + public enum SegmentationType { CHARACTER, WORD, @@ -17,7 +31,7 @@ public enum SegmentationType { // COUNT } - public Builder builder() { + public static Builder builder() { return new Builder(); } @@ -26,14 +40,33 @@ public Builder builder() { this.segmentationType = segmentationType; } + BreakIterator getBreakIterator() { + BreakIterator breakIter; + switch (this.segmentationType) { + case LINE: + breakIter = BreakIterator.getLineInstance(this.locale); + break; + case SENTENCE: + breakIter = BreakIterator.getSentenceInstance(this.locale); + break; + case WORD: + breakIter = BreakIterator.getWordInstance(this.locale); + break; + case CHARACTER: + default: + breakIter = BreakIterator.getCharacterInstance(this.locale); + break; + } + return breakIter; + } + public static class Builder { private ULocale locale = ULocale.ROOT; private SegmentationType segmentationType = SegmentationType.CHARACTER; - public Builder() { - } + Builder() { } public Builder setLocale(ULocale locale) { this.locale = locale; @@ -46,9 +79,47 @@ public Builder setSegmentationType(SegmentationType segmentationType) { } public LocalizedSegmenter build() { - return new LocalizedSegmenter(locale, segmentationType); + return new LocalizedSegmenter(this.locale, this.segmentationType); } } + public static class LocalizedSegments implements Segments { + + private String source; + + private LocalizedSegmenter segmenter; + + private LocalizedSegments(String source, LocalizedSegmenter segmenter) { + this.source = source; + this.segmenter = segmenter; + } + + @Override + public String getSourceString() { + return source; + } + + @Override + public Stream ranges() { + BreakIterator breakIter = this.segmenter.getBreakIterator(); + breakIter.setText(this.source); + + int start = breakIter.first(); + int limit = breakIter.next(); + if (limit == BreakIterator.DONE) { + return Stream.empty(); + } else { + Stream.Builder streamBuilder = Stream.builder(); + while (limit != BreakIterator.DONE) { + SegmentRange range = new SegmentRange(start, limit); + streamBuilder.add(range); + start = limit; + limit = breakIter.next(); + } + return streamBuilder.build(); + } + } + } + } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java new file mode 100644 index 000000000000..e82392bdd1cc --- /dev/null +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java @@ -0,0 +1,2 @@ +package com.ibm.icu.text;public class RuleBasedSegmenter { +} diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java index 4f355cf5a58c..e4922007b5df 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java @@ -2,26 +2,17 @@ import java.util.stream.Stream; -public class Segments { +public interface Segments { - private String sourceString; + String getSourceString(); - private Segmenter segmenter; - - public Segments(String sourceString, Segmenter segmenter) { - this.sourceString = sourceString; - this.segmenter = segmenter; - } - - public Stream subSequences() { - return ranges().map((range) -> sourceString.subSequence(range.getStart(), range.getLimit())); + default Stream subSequences() { + return ranges().map((range) -> getSourceString().subSequence(range.getStart(), range.getLimit())); } - public Stream ranges() { - return null; - } + Stream ranges(); - public static class SegmentRange { + class SegmentRange { int start; int limit; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java new file mode 100644 index 000000000000..66eef38b563e --- /dev/null +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java @@ -0,0 +1,48 @@ +package com.ibm.icu.dev.test.text; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +import com.ibm.icu.dev.test.CoreTestFmwk; +import com.ibm.icu.text.LocalizedSegmenter; +import com.ibm.icu.text.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.text.Segmenter; +import com.ibm.icu.text.Segments; +import com.ibm.icu.util.ULocale; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class LocalizedSegmenterTest extends CoreTestFmwk { + + @Test + public void testLocaleInLocalizedSegmenter() { + String source = "k:a"; + + Object[][] casesData = { + {"en", Arrays.asList("k", ":", "a")}, + {"sv", Arrays.asList("k:a")} + }; + + for (Object[] caseDatum : casesData) { + String localeTag = (String) caseDatum[0]; + ULocale locale = ULocale.forLanguageTag(localeTag); + List expWords = (List) caseDatum[1]; + + Segmenter wordSeg = + LocalizedSegmenter.builder() + .setLocale(locale) + .setSegmentationType(SegmentationType.WORD) + .build(); + Segments segments = wordSeg.segment(source); + + List actWords = segments.subSequences().collect(Collectors.toList()); + + assertThat(actWords, is(expWords)); + } + } +} diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java new file mode 100644 index 000000000000..1ccc0893d0bd --- /dev/null +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java @@ -0,0 +1,5 @@ +package com.ibm.icu.dev.test.text; + +public class RuleBasedSegmenterTest { + +} diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java new file mode 100644 index 000000000000..e4a960eda19f --- /dev/null +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java @@ -0,0 +1,69 @@ +package com.ibm.icu.dev.test.text; + +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +import com.ibm.icu.dev.test.CoreTestFmwk; +import com.ibm.icu.text.LocalizedSegmenter; +import com.ibm.icu.text.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.text.Segments; +import com.ibm.icu.util.ULocale; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.hamcrest.CoreMatchers; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class SegmentsTest extends CoreTestFmwk { + + @Test + public void testMultipleSegmentsFromSegmenter() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + String source2 = "Sphinx of black quartz, judge my vow."; + String source3 = "How vexingly quick daft zebras jump!"; + + List exp1 = Arrays.asList("The", " ", "quick", " ", "brown", " ", "fox", " ", + "jumped", " ", "over", " ", "the", " ", "lazy", " ", "dog", "."); + List exp2 = Arrays.asList("Sphinx", " ", "of", " ", "black", " ", "quartz", ",", + " ", "judge", " ", "my", " ", "vow", "."); + List exp3 = Arrays.asList("How", " ", "vexingly", " ", "quick", " ", "daft", " ", + "zebras", " ", "jump", "!"); + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + List act1 = segments1.subSequences().collect(Collectors.toList()); + assertThat(act1, is(exp1)); + + // Create new Segments for source2 + Segments segments2 = enWordSegmenter.segment(source2); + List act2 = segments2.subSequences().collect(Collectors.toList()); + assertThat(act2, is(exp2)); + + // Check that Segments for source1 is unaffected + act1 = segments1.subSequences().collect(Collectors.toList()); + assertThat(act1, is(exp1)); + + // Create new Segments for source3 + Segments segments3 = enWordSegmenter.segment(source3); + List act3 = segments3.subSequences().collect(Collectors.toList()); + assertThat(act3, is(exp3)); + + // Check that Segments for source1 is unaffected + act1 = segments1.subSequences().collect(Collectors.toList()); + assertThat(act1, is(exp1)); + + // Check that Segments for source2 is unaffected + act2 = segments2.subSequences().collect(Collectors.toList()); + assertThat(act2, is(exp2)); + } + +} From 858d7904ada8961145594416d5a92f5ec50041bf Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 8 Oct 2024 16:22:17 -0700 Subject: [PATCH 03/43] ICU-22789 Add an initial impl for RuleBasedSegmenter --- .../com/ibm/icu/text/RuleBasedSegmenter.java | 83 ++++++++++++++++++- .../dev/test/text/RuleBasedSegmenterTest.java | 46 +++++++++- 2 files changed, 127 insertions(+), 2 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java index e82392bdd1cc..406643fe5503 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java @@ -1,2 +1,83 @@ -package com.ibm.icu.text;public class RuleBasedSegmenter { +package com.ibm.icu.text; + +import java.util.stream.Stream; + +public class RuleBasedSegmenter implements Segmenter { + + private String rules; + + + @Override + public Segments segment(String s) { + return new RuleBasedSegments(s, this); + } + + public String getRules() { + return this.rules; + } + + public static Builder builder() { + return new Builder(); + } + + RuleBasedSegmenter(String rules) { + this.rules = rules; + } + + RuleBasedBreakIterator getBreakIterator() { + return new RuleBasedBreakIterator(this.rules); + } + + public static class Builder { + + String rules; + + Builder() { } + + public Builder setRules(String rules) { + this.rules = rules; + return this; + } + + public RuleBasedSegmenter build() { + return new RuleBasedSegmenter(this.rules); + } + } + + public static class RuleBasedSegments implements Segments { + private String source; + + private RuleBasedSegmenter segmenter; + + @Override + public Stream ranges() { + RuleBasedBreakIterator breakIter = this.segmenter.getBreakIterator(); + breakIter.setText(this.source); + + int start = breakIter.first(); + int limit = breakIter.next(); + if (limit == BreakIterator.DONE) { + return Stream.empty(); + } else { + Stream.Builder streamBuilder = Stream.builder(); + while (limit != BreakIterator.DONE) { + SegmentRange range = new SegmentRange(start, limit); + streamBuilder.add(range); + start = limit; + limit = breakIter.next(); + } + return streamBuilder.build(); + } + } + + RuleBasedSegments(String source, RuleBasedSegmenter segmenter) { + this.source = source; + this.segmenter = segmenter; + } + + @Override + public String getSourceString() { + return this.source; + } + } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java index 1ccc0893d0bd..34a702220f98 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java @@ -1,5 +1,49 @@ package com.ibm.icu.dev.test.text; -public class RuleBasedSegmenterTest { +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; + +import com.ibm.icu.dev.test.CoreTestFmwk; +import com.ibm.icu.text.RuleBasedSegmenter; +import com.ibm.icu.text.Segmenter; +import com.ibm.icu.text.Segments; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RuleBasedSegmenterTest extends CoreTestFmwk { + + @Test + public void testRules() { + String source = "hejsan k:a tack"; + + Object[][] casesData = { + {"default", ".*;", Arrays.asList("hejsan k:a tack")}, + // TODO: add more cases once RBBI rule syntax is understood + }; + + for (Object[] caseDatum : casesData) { + String desc = (String) caseDatum[0]; + String subrule = (String) caseDatum[1]; + List expWords = (List) caseDatum[2]; + + // the following rule substring was taken as a subset from BreakIteratorRules_en_US_TEST.java: + String rules = subrule; + + Segmenter seg = RuleBasedSegmenter.builder() + .setRules(rules) + .build(); + Segments segments = seg.segment(source); + + List actWords = segments.subSequences().collect(Collectors.toList()); + + assertThat(desc, actWords, is(expWords)); + } + + } } From 1313ff05fcf43e00525065b18e9c57a654c1a7e9 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 8 Oct 2024 16:22:25 -0700 Subject: [PATCH 04/43] ICU-22789 Fix typos --- .../ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java index dc7b59d873ef..ab9de9cda46d 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/BreakIteratorRules_en_US_TEST.java @@ -46,7 +46,7 @@ public Object[][] getContents() { // all of which should not influence the algorithm "$_ignore_=[[:Mn:][:Me:][:Cf:]];" - // lower and upper case Roman letters, apostrophy and dash are + // lower and upper case Roman letters, apostrophe and dash are // in the English dictionary +"$_dictionary_=[a-zA-Z\\'\\-];" @@ -64,7 +64,7 @@ public Object[][] getContents() { +"$mid_word=[[:Pd:]\u00ad\u2027\\\"\\\'];" // punctuation that can occur in the middle of a number: currently - // apostrophes, qoutation marks, periods, commas, and the Arabic + // apostrophes, quotation marks, periods, commas, and the Arabic // decimal point +"$mid_num=[\\\"\\\'\\,\u066b\\.];" From aa4724a137b0f5e44694169d744042dbe366e87d Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 9 Oct 2024 15:36:59 -0700 Subject: [PATCH 05/43] ICU-22789 Refactor duplicate impl of ranges into default interface method --- .../com/ibm/icu/text/LocalizedSegmenter.java | 23 +++------------ .../com/ibm/icu/text/RuleBasedSegmenter.java | 29 +++++-------------- .../main/java/com/ibm/icu/text/Segmenter.java | 3 ++ .../main/java/com/ibm/icu/text/Segments.java | 23 ++++++++++++++- 4 files changed, 36 insertions(+), 42 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java index 7fb5397dfa51..6740457cd6c2 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java @@ -40,7 +40,8 @@ public static Builder builder() { this.segmentationType = segmentationType; } - BreakIterator getBreakIterator() { + @Override + public BreakIterator getNewBreakIterator() { BreakIterator breakIter; switch (this.segmentationType) { case LINE: @@ -101,24 +102,8 @@ public String getSourceString() { } @Override - public Stream ranges() { - BreakIterator breakIter = this.segmenter.getBreakIterator(); - breakIter.setText(this.source); - - int start = breakIter.first(); - int limit = breakIter.next(); - if (limit == BreakIterator.DONE) { - return Stream.empty(); - } else { - Stream.Builder streamBuilder = Stream.builder(); - while (limit != BreakIterator.DONE) { - SegmentRange range = new SegmentRange(start, limit); - streamBuilder.add(range); - start = limit; - limit = breakIter.next(); - } - return streamBuilder.build(); - } + public Segmenter getSegmenter() { + return segmenter; } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java index 406643fe5503..e78fc04d4e9f 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java @@ -24,7 +24,8 @@ public static Builder builder() { this.rules = rules; } - RuleBasedBreakIterator getBreakIterator() { + @Override + public RuleBasedBreakIterator getNewBreakIterator() { return new RuleBasedBreakIterator(this.rules); } @@ -49,27 +50,6 @@ public static class RuleBasedSegments implements Segments { private RuleBasedSegmenter segmenter; - @Override - public Stream ranges() { - RuleBasedBreakIterator breakIter = this.segmenter.getBreakIterator(); - breakIter.setText(this.source); - - int start = breakIter.first(); - int limit = breakIter.next(); - if (limit == BreakIterator.DONE) { - return Stream.empty(); - } else { - Stream.Builder streamBuilder = Stream.builder(); - while (limit != BreakIterator.DONE) { - SegmentRange range = new SegmentRange(start, limit); - streamBuilder.add(range); - start = limit; - limit = breakIter.next(); - } - return streamBuilder.build(); - } - } - RuleBasedSegments(String source, RuleBasedSegmenter segmenter) { this.source = source; this.segmenter = segmenter; @@ -79,5 +59,10 @@ public Stream ranges() { public String getSourceString() { return this.source; } + + @Override + public Segmenter getSegmenter() { + return segmenter; + } } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java index 8e1ad367167d..433a8f0202a0 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java @@ -2,4 +2,7 @@ public interface Segmenter { Segments segment(String s); + + @Deprecated + BreakIterator getNewBreakIterator(); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java index e4922007b5df..fed0645d3ea0 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java @@ -6,11 +6,32 @@ public interface Segments { String getSourceString(); + @Deprecated + Segmenter getSegmenter(); + default Stream subSequences() { return ranges().map((range) -> getSourceString().subSequence(range.getStart(), range.getLimit())); } - Stream ranges(); + default Stream ranges() { + BreakIterator breakIter = getSegmenter().getNewBreakIterator(); + breakIter.setText(getSourceString()); + + int start = breakIter.first(); + int limit = breakIter.next(); + if (limit == BreakIterator.DONE) { + return Stream.empty(); + } else { + Stream.Builder streamBuilder = Stream.builder(); + while (limit != BreakIterator.DONE) { + SegmentRange range = new SegmentRange(start, limit); + streamBuilder.add(range); + start = limit; + limit = breakIter.next(); + } + return streamBuilder.build(); + } + }; class SegmentRange { int start; From 5defe25f920f52649e2ab5d1a625252977782947 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 9 Oct 2024 16:40:17 -0700 Subject: [PATCH 06/43] ICU-22789 Refactor `Segments` to only create BreakIterator once per string --- .../main/java/com/ibm/icu/text/LocalizedSegmenter.java | 8 ++++++++ .../main/java/com/ibm/icu/text/RuleBasedSegmenter.java | 8 ++++++++ .../core/src/main/java/com/ibm/icu/text/Segments.java | 5 ++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java index 6740457cd6c2..ea8e9ccea8cc 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java @@ -91,9 +91,12 @@ public static class LocalizedSegments implements Segments { private LocalizedSegmenter segmenter; + private BreakIterator breakIter; + private LocalizedSegments(String source, LocalizedSegmenter segmenter) { this.source = source; this.segmenter = segmenter; + this.breakIter = this.segmenter.getNewBreakIterator(); } @Override @@ -105,6 +108,11 @@ public String getSourceString() { public Segmenter getSegmenter() { return segmenter; } + + @Override + public BreakIterator getInstanceBreakIterator() { + return this.breakIter; + } } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java index e78fc04d4e9f..901c5bab1e73 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java @@ -50,9 +50,12 @@ public static class RuleBasedSegments implements Segments { private RuleBasedSegmenter segmenter; + private BreakIterator breakIter; + RuleBasedSegments(String source, RuleBasedSegmenter segmenter) { this.source = source; this.segmenter = segmenter; + this.breakIter = this.segmenter.getNewBreakIterator(); } @Override @@ -64,5 +67,10 @@ public String getSourceString() { public Segmenter getSegmenter() { return segmenter; } + + @Override + public BreakIterator getInstanceBreakIterator() { + return this.breakIter; + } } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java index fed0645d3ea0..1c9561abebb8 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java @@ -9,12 +9,15 @@ public interface Segments { @Deprecated Segmenter getSegmenter(); + @Deprecated + BreakIterator getInstanceBreakIterator(); + default Stream subSequences() { return ranges().map((range) -> getSourceString().subSequence(range.getStart(), range.getLimit())); } default Stream ranges() { - BreakIterator breakIter = getSegmenter().getNewBreakIterator(); + BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceString()); int start = breakIter.first(); From 4d2f1d0a906ebfc1ad2d09d79765e3fa447ff3ec Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 29 Oct 2024 15:58:51 -0700 Subject: [PATCH 07/43] ICU-22789 Make the Segments.ranges() Stream lazy --- .../main/java/com/ibm/icu/text/Segments.java | 61 ++++++++++++++----- .../ibm/icu/dev/test/text/SegmentsTest.java | 24 +++++++- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java index 1c9561abebb8..fa0c147d0880 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java @@ -1,6 +1,8 @@ package com.ibm.icu.text; +import java.util.Iterator; import java.util.stream.Stream; +import java.util.stream.StreamSupport; public interface Segments { @@ -20,20 +22,9 @@ default Stream ranges() { BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceString()); - int start = breakIter.first(); - int limit = breakIter.next(); - if (limit == BreakIterator.DONE) { - return Stream.empty(); - } else { - Stream.Builder streamBuilder = Stream.builder(); - while (limit != BreakIterator.DONE) { - SegmentRange range = new SegmentRange(start, limit); - streamBuilder.add(range); - start = limit; - limit = breakIter.next(); - } - return streamBuilder.build(); - } + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + SegmentRangeIterable iterable = new SegmentRangeIterable(breakIter); + return StreamSupport.stream(iterable.spliterator(), false); }; class SegmentRange { @@ -54,4 +45,46 @@ public int getLimit(){ } } + /** + * This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn + * enables the creation of a lazy {@code Stream}. + */ + class SegmentRangeIterable implements Iterable { + BreakIterator breakIter; + + SegmentRangeIterable(BreakIterator breakIter) { + this.breakIter = breakIter; + } + @Override + public Iterator iterator() { + return new SegmentRangeIterator(this.breakIter); + } + } + + class SegmentRangeIterator implements Iterator { + BreakIterator breakIter; + int start; + int limit; + + SegmentRangeIterator(BreakIterator breakIter) { + this.breakIter = breakIter; + this.start = breakIter.first(); + this.limit = breakIter.next(); + } + + @Override + public boolean hasNext() { + return this.limit != BreakIterator.DONE; + } + + @Override + public SegmentRange next() { + SegmentRange result = new SegmentRange(this.start, this.limit); + this.start = this.limit; + this.limit = this.breakIter.next(); + + return result; + } + + } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java index e4a960eda19f..21d3ceefc346 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java @@ -7,11 +7,11 @@ import com.ibm.icu.text.LocalizedSegmenter; import com.ibm.icu.text.LocalizedSegmenter.SegmentationType; import com.ibm.icu.text.Segments; +import com.ibm.icu.text.Segments.SegmentRange; import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import org.hamcrest.CoreMatchers; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -19,6 +19,28 @@ @RunWith(JUnit4.class) public class SegmentsTest extends CoreTestFmwk { + @Test + public void testRangesFromSegmenter() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List ranges = segments1.ranges().collect(Collectors.toList()); + + assertEquals("first range start", 0, ranges.get(0).getStart()); + assertEquals("first range limit", 3, ranges.get(0).getLimit()); + + assertEquals("second range start", 3, ranges.get(1).getStart()); + assertEquals("second range limit", 4, ranges.get(1).getLimit()); + } + @Test public void testMultipleSegmentsFromSegmenter() { LocalizedSegmenter enWordSegmenter = From cf44d9fdf6e77716978b80b5c86951e2d18737ca Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 5 Nov 2024 15:27:44 -0800 Subject: [PATCH 08/43] Move the SegmentationType into Segmenter b/c it is not specific to LocalizedSegmenter --- .../main/java/com/ibm/icu/text/LocalizedSegmenter.java | 9 --------- .../core/src/main/java/com/ibm/icu/text/Segmenter.java | 9 +++++++++ .../ibm/icu/dev/test/text/LocalizedSegmenterTest.java | 2 +- .../java/com/ibm/icu/dev/test/text/SegmentsTest.java | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java index ea8e9ccea8cc..74b8af95b484 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java @@ -22,15 +22,6 @@ public SegmentationType getSegmentationType() { return this.segmentationType; } - public enum SegmentationType { - CHARACTER, - WORD, - LINE, - SENTENCE, - // TITLE, - // COUNT - } - public static Builder builder() { return new Builder(); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java index 433a8f0202a0..1b7baa52e7df 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java @@ -5,4 +5,13 @@ public interface Segmenter { @Deprecated BreakIterator getNewBreakIterator(); + + public enum SegmentationType { + CHARACTER, + WORD, + LINE, + SENTENCE, + // TITLE, + // COUNT + } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java index 66eef38b563e..90efeea4c217 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java @@ -5,7 +5,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.LocalizedSegmenter; -import com.ibm.icu.text.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.text.Segmenter.SegmentationType; import com.ibm.icu.text.Segmenter; import com.ibm.icu.text.Segments; import com.ibm.icu.util.ULocale; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java index 21d3ceefc346..87c6fc3ac1ab 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java @@ -5,7 +5,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.LocalizedSegmenter; -import com.ibm.icu.text.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.text.Segmenter.SegmentationType; import com.ibm.icu.text.Segments; import com.ibm.icu.text.Segments.SegmentRange; import com.ibm.icu.util.ULocale; From 325d72a11c8f004d9594704fc9f764e8845b9766 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 6 Nov 2024 17:19:33 -0800 Subject: [PATCH 09/43] formatting typo --- icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java index fa0c147d0880..dd82a353d5b4 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java @@ -40,7 +40,7 @@ public int getStart() { return start; } - public int getLimit(){ + public int getLimit() { return limit; } } From f9a8dc269b657159b2bd9e195085cdbd991dcd92 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 15 Nov 2024 13:58:27 -0800 Subject: [PATCH 10/43] Rename enum value to match Unicode terminology for clarity purposes --- .../src/main/java/com/ibm/icu/text/LocalizedSegmenter.java | 5 ++--- .../main/core/src/main/java/com/ibm/icu/text/Segmenter.java | 4 +--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java index 74b8af95b484..f66d670e3a42 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java @@ -1,7 +1,6 @@ package com.ibm.icu.text; import com.ibm.icu.util.ULocale; -import java.util.stream.Stream; public class LocalizedSegmenter implements Segmenter { @@ -44,7 +43,7 @@ public BreakIterator getNewBreakIterator() { case WORD: breakIter = BreakIterator.getWordInstance(this.locale); break; - case CHARACTER: + case GRAPHEME_CLUSTER: default: breakIter = BreakIterator.getCharacterInstance(this.locale); break; @@ -56,7 +55,7 @@ public static class Builder { private ULocale locale = ULocale.ROOT; - private SegmentationType segmentationType = SegmentationType.CHARACTER; + private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER; Builder() { } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java index 1b7baa52e7df..4f8543a37e09 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java @@ -7,11 +7,9 @@ public interface Segmenter { BreakIterator getNewBreakIterator(); public enum SegmentationType { - CHARACTER, + GRAPHEME_CLUSTER, WORD, LINE, SENTENCE, - // TITLE, - // COUNT } } From b88e32d1bb927f7d83df35f99ade8a44bf2bbde6 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Mon, 18 Nov 2024 17:33:44 -0800 Subject: [PATCH 11/43] Shorten names of inner classes to remove redundant prefix --- .../main/java/com/ibm/icu/text/Segments.java | 24 +++++++++---------- .../ibm/icu/dev/test/text/SegmentsTest.java | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java index dd82a353d5b4..cd254b587df3 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java @@ -18,20 +18,20 @@ default Stream subSequences() { return ranges().map((range) -> getSourceString().subSequence(range.getStart(), range.getLimit())); } - default Stream ranges() { + default Stream ranges() { BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceString()); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - SegmentRangeIterable iterable = new SegmentRangeIterable(breakIter); + RangeIterable iterable = new RangeIterable(breakIter); return StreamSupport.stream(iterable.spliterator(), false); }; - class SegmentRange { + class Range { int start; int limit; - public SegmentRange(int start, int limit) { + public Range(int start, int limit) { this.start = start; this.limit = limit; } @@ -49,24 +49,24 @@ public int getLimit() { * This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn * enables the creation of a lazy {@code Stream}. */ - class SegmentRangeIterable implements Iterable { + class RangeIterable implements Iterable { BreakIterator breakIter; - SegmentRangeIterable(BreakIterator breakIter) { + RangeIterable(BreakIterator breakIter) { this.breakIter = breakIter; } @Override - public Iterator iterator() { - return new SegmentRangeIterator(this.breakIter); + public Iterator iterator() { + return new RangeIterator(this.breakIter); } } - class SegmentRangeIterator implements Iterator { + class RangeIterator implements Iterator { BreakIterator breakIter; int start; int limit; - SegmentRangeIterator(BreakIterator breakIter) { + RangeIterator(BreakIterator breakIter) { this.breakIter = breakIter; this.start = breakIter.first(); this.limit = breakIter.next(); @@ -78,8 +78,8 @@ public boolean hasNext() { } @Override - public SegmentRange next() { - SegmentRange result = new SegmentRange(this.start, this.limit); + public Range next() { + Range result = new Range(this.start, this.limit); this.start = this.limit; this.limit = this.breakIter.next(); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java index 87c6fc3ac1ab..05b7224fffc2 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java @@ -7,7 +7,7 @@ import com.ibm.icu.text.LocalizedSegmenter; import com.ibm.icu.text.Segmenter.SegmentationType; import com.ibm.icu.text.Segments; -import com.ibm.icu.text.Segments.SegmentRange; +import com.ibm.icu.text.Segments.Range; import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.List; @@ -32,7 +32,7 @@ public void testRangesFromSegmenter() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List ranges = segments1.ranges().collect(Collectors.toList()); + List ranges = segments1.ranges().collect(Collectors.toList()); assertEquals("first range start", 0, ranges.get(0).getStart()); assertEquals("first range limit", 3, ranges.get(0).getLimit()); From 652d49a7fd60bbc827454c5868f8a2ff9a459ad5 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 4 Dec 2024 15:27:37 -0800 Subject: [PATCH 12/43] ICU-22789 Create subpackage for Segmenter related classes --- .../icu/text/{ => segmenter}/LocalizedSegmenter.java | 3 ++- .../icu/text/{ => segmenter}/RuleBasedSegmenter.java | 5 +++-- .../com/ibm/icu/text/{ => segmenter}/Segmenter.java | 4 +++- .../com/ibm/icu/text/{ => segmenter}/Segments.java | 3 ++- .../text/{ => segmenter}/LocalizedSegmenterTest.java | 10 +++++----- .../text/{ => segmenter}/RuleBasedSegmenterTest.java | 8 ++++---- .../dev/test/text/{ => segmenter}/SegmentsTest.java | 10 +++++----- 7 files changed, 24 insertions(+), 19 deletions(-) rename icu4j/main/core/src/main/java/com/ibm/icu/text/{ => segmenter}/LocalizedSegmenter.java (96%) rename icu4j/main/core/src/main/java/com/ibm/icu/text/{ => segmenter}/RuleBasedSegmenter.java (91%) rename icu4j/main/core/src/main/java/com/ibm/icu/text/{ => segmenter}/Segmenter.java (73%) rename icu4j/main/core/src/main/java/com/ibm/icu/text/{ => segmenter}/Segments.java (96%) rename icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/{ => segmenter}/LocalizedSegmenterTest.java (83%) rename icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/{ => segmenter}/RuleBasedSegmenterTest.java (87%) rename icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/{ => segmenter}/SegmentsTest.java (92%) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java similarity index 96% rename from icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java rename to icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index f66d670e3a42..9b5af3a5918d 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -1,5 +1,6 @@ -package com.ibm.icu.text; +package com.ibm.icu.text.segmenter; +import com.ibm.icu.text.BreakIterator; import com.ibm.icu.util.ULocale; public class LocalizedSegmenter implements Segmenter { diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java similarity index 91% rename from icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java rename to icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 901c5bab1e73..393ef0b3f5c5 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -1,6 +1,7 @@ -package com.ibm.icu.text; +package com.ibm.icu.text.segmenter; -import java.util.stream.Stream; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; public class RuleBasedSegmenter implements Segmenter { diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java similarity index 73% rename from icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java rename to icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java index 4f8543a37e09..4f1832986c5d 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java @@ -1,4 +1,6 @@ -package com.ibm.icu.text; +package com.ibm.icu.text.segmenter; + +import com.ibm.icu.text.BreakIterator; public interface Segmenter { Segments segment(String s); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java similarity index 96% rename from icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java rename to icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index cd254b587df3..4886b5020a6d 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -1,5 +1,6 @@ -package com.ibm.icu.text; +package com.ibm.icu.text.segmenter; +import com.ibm.icu.text.BreakIterator; import java.util.Iterator; import java.util.stream.Stream; import java.util.stream.StreamSupport; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java similarity index 83% rename from icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java rename to icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java index 90efeea4c217..477d8bca5d51 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/LocalizedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java @@ -1,13 +1,13 @@ -package com.ibm.icu.dev.test.text; +package com.ibm.icu.dev.test.text.segmenter; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import com.ibm.icu.dev.test.CoreTestFmwk; -import com.ibm.icu.text.LocalizedSegmenter; -import com.ibm.icu.text.Segmenter.SegmentationType; -import com.ibm.icu.text.Segmenter; -import com.ibm.icu.text.Segments; +import com.ibm.icu.text.segmenter.LocalizedSegmenter; +import com.ibm.icu.text.segmenter.Segmenter.SegmentationType; +import com.ibm.icu.text.segmenter.Segmenter; +import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.List; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java similarity index 87% rename from icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java rename to icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java index 34a702220f98..5e46fe608038 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/RuleBasedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java @@ -1,12 +1,12 @@ -package com.ibm.icu.dev.test.text; +package com.ibm.icu.dev.test.text.segmenter; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import com.ibm.icu.dev.test.CoreTestFmwk; -import com.ibm.icu.text.RuleBasedSegmenter; -import com.ibm.icu.text.Segmenter; -import com.ibm.icu.text.Segments; +import com.ibm.icu.text.segmenter.RuleBasedSegmenter; +import com.ibm.icu.text.segmenter.Segmenter; +import com.ibm.icu.text.segmenter.Segments; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java similarity index 92% rename from icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java rename to icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 05b7224fffc2..9c08d064d335 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -1,13 +1,13 @@ -package com.ibm.icu.dev.test.text; +package com.ibm.icu.dev.test.text.segmenter; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; import com.ibm.icu.dev.test.CoreTestFmwk; -import com.ibm.icu.text.LocalizedSegmenter; -import com.ibm.icu.text.Segmenter.SegmentationType; -import com.ibm.icu.text.Segments; -import com.ibm.icu.text.Segments.Range; +import com.ibm.icu.text.segmenter.LocalizedSegmenter; +import com.ibm.icu.text.segmenter.Segmenter.SegmentationType; +import com.ibm.icu.text.segmenter.Segments; +import com.ibm.icu.text.segmenter.Segments.Range; import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.List; From 032cf040dcdfa42d3064592b4b4cc0ce7b20275a Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 4 Dec 2024 17:48:04 -0800 Subject: [PATCH 13/43] ICU-22789 Parameterize iteration by direction of iteration --- .../com/ibm/icu/text/segmenter/Segments.java | 59 ++++++++++++++++--- .../dev/test/text/segmenter/SegmentsTest.java | 27 ++++++++- 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 4886b5020a6d..8210d148be41 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -20,13 +20,17 @@ default Stream subSequences() { } default Stream ranges() { + return rangesAfterIndex(0); + }; + + default Stream rangesAfterIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceString()); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - RangeIterable iterable = new RangeIterable(breakIter); + RangeIterable iterable = new RangeIterable(breakIter, IterationDirection.FORWARDS, i); return StreamSupport.stream(iterable.spliterator(), false); - }; + } class Range { int start; @@ -46,31 +50,62 @@ public int getLimit() { } } + enum IterationDirection { + FORWARDS, + BACKWARDS, + } + /** * This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn * enables the creation of a lazy {@code Stream}. */ class RangeIterable implements Iterable { BreakIterator breakIter; + IterationDirection direction; + int startIdx; - RangeIterable(BreakIterator breakIter) { + RangeIterable(BreakIterator breakIter, IterationDirection direction, int startIdx) { this.breakIter = breakIter; + this.direction = direction; + this.startIdx = startIdx; } @Override public Iterator iterator() { - return new RangeIterator(this.breakIter); + return new RangeIterator(this.breakIter, this.direction, this.startIdx); } } class RangeIterator implements Iterator { BreakIterator breakIter; + IterationDirection direction; + int startIdx; // remove this if not needed int start; int limit; - RangeIterator(BreakIterator breakIter) { + RangeIterator(BreakIterator breakIter, IterationDirection direction, int startIdx) { this.breakIter = breakIter; - this.start = breakIter.first(); - this.limit = breakIter.next(); + this.direction = direction; + + if (breakIter.isBoundary(startIdx)) { + this.start = breakIter.following(startIdx-1); + assert this.start == startIdx; + } else if (direction == IterationDirection.FORWARDS) { + this.start = breakIter.following(startIdx); + } else { + assert direction == IterationDirection.BACKWARDS; + this.start = breakIter.preceding(startIdx); + } + + this.limit = getDirectionBasedNext(); + } + + int getDirectionBasedNext() { + if (direction == IterationDirection.FORWARDS) { + return breakIter.next(); + } else { + assert direction == IterationDirection.BACKWARDS; + return breakIter.previous(); + } } @Override @@ -80,9 +115,15 @@ public boolean hasNext() { @Override public Range next() { - Range result = new Range(this.start, this.limit); + Range result; + if (this.limit < this.start) { + result = new Range(this.limit, this.start); + } else { + result = new Range(this.start, this.limit); + } + this.start = this.limit; - this.limit = this.breakIter.next(); + this.limit = getDirectionBasedNext(); return result; } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 9c08d064d335..b16ae25eb275 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -20,7 +20,7 @@ public class SegmentsTest extends CoreTestFmwk { @Test - public void testRangesFromSegmenter() { + public void testRanges() { LocalizedSegmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) @@ -42,7 +42,7 @@ public void testRangesFromSegmenter() { } @Test - public void testMultipleSegmentsFromSegmenter() { + public void testMultipleSegmentObjectsFromSegmenter() { LocalizedSegmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) @@ -88,4 +88,27 @@ public void testMultipleSegmentsFromSegmenter() { assertThat(act2, is(exp2)); } + @Test + public void testRangesAfterIndex() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + int startIdx = 1; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List ranges = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); + + assertEquals("first range start", 3, ranges.get(0).getStart()); + assertEquals("first range limit", 4, ranges.get(0).getLimit()); + + assertEquals("second range start", 4, ranges.get(1).getStart()); + assertEquals("second range limit", 9, ranges.get(1).getLimit()); + } + } From faafbcb2a73fa6a817dc646bd85e2d71ad4e1e72 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 4 Dec 2024 18:03:58 -0800 Subject: [PATCH 14/43] ICU-22789 Create a public `Function` to convert a range to a string --- .../src/main/java/com/ibm/icu/text/segmenter/Segments.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 8210d148be41..b012a89d52bc 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -2,6 +2,7 @@ import com.ibm.icu.text.BreakIterator; import java.util.Iterator; +import java.util.function.Function; import java.util.stream.Stream; import java.util.stream.StreamSupport; @@ -16,7 +17,7 @@ public interface Segments { BreakIterator getInstanceBreakIterator(); default Stream subSequences() { - return ranges().map((range) -> getSourceString().subSequence(range.getStart(), range.getLimit())); + return ranges().map(rangeToSequenceFn()); } default Stream ranges() { @@ -32,6 +33,10 @@ default Stream rangesAfterIndex(int i) { return StreamSupport.stream(iterable.spliterator(), false); } + default Function rangeToSequenceFn() { + return range -> getSourceString().subSequence(range.getStart(), range.getLimit()); + } + class Range { int start; int limit; From 3f62900bef0e0b600ca5446c368f124652eb4b25 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 4 Dec 2024 18:13:27 -0800 Subject: [PATCH 15/43] ICU-22789 Add API for backwards direction lazy Stream --- .../com/ibm/icu/text/segmenter/Segments.java | 15 ++++-- .../dev/test/text/segmenter/SegmentsTest.java | 46 +++++++++++++++++++ 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index b012a89d52bc..9da43e9718d2 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -33,6 +33,15 @@ default Stream rangesAfterIndex(int i) { return StreamSupport.stream(iterable.spliterator(), false); } + default Stream rangesBeforeIndex(int i) { + BreakIterator breakIter = getInstanceBreakIterator(); + breakIter.setText(getSourceString()); + + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + RangeIterable iterable = new RangeIterable(breakIter, IterationDirection.BACKWARDS, i); + return StreamSupport.stream(iterable.spliterator(), false); + } + default Function rangeToSequenceFn() { return range -> getSourceString().subSequence(range.getStart(), range.getLimit()); } @@ -101,10 +110,10 @@ class RangeIterator implements Iterator { this.start = breakIter.preceding(startIdx); } - this.limit = getDirectionBasedNext(); + this.limit = getDirectionBasedNextIdx(); } - int getDirectionBasedNext() { + int getDirectionBasedNextIdx() { if (direction == IterationDirection.FORWARDS) { return breakIter.next(); } else { @@ -128,7 +137,7 @@ public Range next() { } this.start = this.limit; - this.limit = getDirectionBasedNext(); + this.limit = getDirectionBasedNextIdx(); return result; } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index b16ae25eb275..f8c0c72f2cfb 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -111,4 +111,50 @@ public void testRangesAfterIndex() { assertEquals("second range limit", 9, ranges.get(1).getLimit()); } + @Test + public void testRangesBeforeIndex() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + int startIdx = 9; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List ranges = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); + + assertEquals("first range start", 4, ranges.get(0).getStart()); + assertEquals("first range limit", 9, ranges.get(0).getLimit()); + + assertEquals("second range start", 3, ranges.get(1).getStart()); + assertEquals("second range limit", 4, ranges.get(1).getLimit()); + } + + @Test + public void testRangeToSequenceFn() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + int startIdx = 9; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List exp1 = Arrays.asList("quick", " ", "The"); + + List act1 = segments1.rangesBeforeIndex(startIdx) + .map(segments1.rangeToSequenceFn()) + .collect(Collectors.toList()); + + assertThat(act1, is(exp1)); + } + } From 7c0c366d8e18239081cf6f5ec6decc0093b1e92c Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 4 Dec 2024 18:26:33 -0800 Subject: [PATCH 16/43] ICU-22789 Match BreakIterator behavior to always advance from start pos --- .../src/main/java/com/ibm/icu/text/segmenter/Segments.java | 7 ++----- .../com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 9da43e9718d2..cf3249d7678f 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -21,7 +21,7 @@ default Stream subSequences() { } default Stream ranges() { - return rangesAfterIndex(0); + return rangesAfterIndex(-1); }; default Stream rangesAfterIndex(int i) { @@ -100,10 +100,7 @@ class RangeIterator implements Iterator { this.breakIter = breakIter; this.direction = direction; - if (breakIter.isBoundary(startIdx)) { - this.start = breakIter.following(startIdx-1); - assert this.start == startIdx; - } else if (direction == IterationDirection.FORWARDS) { + if (direction == IterationDirection.FORWARDS) { this.start = breakIter.following(startIdx); } else { assert direction == IterationDirection.BACKWARDS; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index f8c0c72f2cfb..4de1f44e154d 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -120,7 +120,7 @@ public void testRangesBeforeIndex() { .build(); String source1 = "The quick brown fox jumped over the lazy dog."; - int startIdx = 9; + int startIdx = 10; // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); @@ -143,7 +143,7 @@ public void testRangeToSequenceFn() { .build(); String source1 = "The quick brown fox jumped over the lazy dog."; - int startIdx = 9; + int startIdx = 10; // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); From 1f4b19a51efa897dc0ed9b37587f0add6e39fca6 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 10 Dec 2024 10:52:53 -0800 Subject: [PATCH 17/43] ICU-22789 Add `rangeAfterIndex` and `rangeBeforeIndex` --- .../com/ibm/icu/text/segmenter/Segments.java | 37 ++++++++ .../dev/test/text/segmenter/SegmentsTest.java | 84 +++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index cf3249d7678f..6f2d6184894f 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -42,6 +42,43 @@ default Stream rangesBeforeIndex(int i) { return StreamSupport.stream(iterable.spliterator(), false); } + default Range rangeAfterIndex(int i) { + BreakIterator breakIter = getInstanceBreakIterator(); + breakIter.setText(getSourceString()); + + int start = breakIter.following(i); + if (start == BreakIterator.DONE) { + return null; + } + + int limit = breakIter.next(); + + return new Range(start, limit); + } + + default Range rangeBeforeIndex(int i) { + BreakIterator breakIter = getInstanceBreakIterator(); + breakIter.setText(getSourceString()); + + + // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs + if (i < 0) { + // return the same thing as we would if preceding() returned DONE + return null; + } + + int start = breakIter.preceding(i); + int limit = breakIter.previous(); + + if (start == BreakIterator.DONE || limit == BreakIterator.DONE) { + return null; + } + + assert limit <= start; + + return new Range(limit, start); + } + default Function rangeToSequenceFn() { return range -> getSourceString().subSequence(range.getStart(), range.getLimit()); } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 4de1f44e154d..bc00888df326 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -157,4 +157,88 @@ public void testRangeToSequenceFn() { assertThat(act1, is(exp1)); } + @Test + public void testRangeAfterIndex() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source = "The quick brown fox jumped over the lazy dog."; + + // Create new Segments for source1 + Segments segments = enWordSegmenter.segment(source); + + Object[][] casesData = { + {"index before beginning", -2, 0, 3}, + {"index at beginning", 0, 3, 4}, + {"index in the middle (end of first segment)", 3, 4, 9}, + {"index at the end", source.length()-1, source.length(), -1}, + {"index after the end", source.length()+1, null, null}, + }; + + for (Object[] caseDatum : casesData) { + String desc = (String) caseDatum[0]; + int startIdx = (int) caseDatum[1]; + Integer expStart = (Integer) caseDatum[2]; + Integer expLimit = (Integer) caseDatum[3]; + + Range range = segments.rangeAfterIndex(startIdx); + + if (expStart == null) { + assert expLimit == null; + assertThat("Out of bounds range should be null", range == null); + } else { + assertEquals(desc + ", start", expStart.intValue(), range.getStart()); + assertEquals(desc + ", limit", expLimit.intValue(), range.getLimit()); + } + } + } + + + @Test + public void testRangeBeforeIndex() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source = "The quick brown fox jumped over the lazy dog."; + + // Create new Segments for source1 + Segments segments = enWordSegmenter.segment(source); + + Object[][] casesData = { + {"index before beginning", -2, null, null}, + {"index at beginning", 0, null, null}, + {"index in the middle of the first segment", 2, null, null}, + {"index in the middle of the third segment", 5, 3, 4}, + {"index at the end", source.length()-1, 40, 41}, + {"index after the end", source.length()+1, source.length()-1, source.length()}, + }; + + for (Object[] caseDatum : casesData) { + String desc = (String) caseDatum[0]; + int startIdx = (int) caseDatum[1]; + Integer expStart = (Integer) caseDatum[2]; + Integer expLimit = (Integer) caseDatum[3]; + + Range range = segments.rangeBeforeIndex(startIdx); + + if (startIdx == -2) { + logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); + } + + if (expStart == null) { + assert expLimit == null; + assertThat("Out of bounds range should be null", range == null); + } else { + assertEquals(desc + ", start", expStart.intValue(), range.getStart()); + assertEquals(desc + ", limit", expLimit.intValue(), range.getLimit()); + } + } + } + } From 03582381cfc7887c5412dd794c608d89e9330633 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 11 Dec 2024 22:14:15 -0800 Subject: [PATCH 18/43] ICU-22789 Genericize source string type as CharSequence instead of String --- .../ibm/icu/text/segmenter/LocalizedSegmenter.java | 8 ++++---- .../ibm/icu/text/segmenter/RuleBasedSegmenter.java | 8 ++++---- .../java/com/ibm/icu/text/segmenter/Segmenter.java | 2 +- .../java/com/ibm/icu/text/segmenter/Segments.java | 12 ++++++------ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 9b5af3a5918d..44d67096125d 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -10,7 +10,7 @@ public class LocalizedSegmenter implements Segmenter { private SegmentationType segmentationType; @Override - public Segments segment(String s) { + public Segments segment(CharSequence s) { return new LocalizedSegments(s, this); } @@ -78,20 +78,20 @@ public LocalizedSegmenter build() { public static class LocalizedSegments implements Segments { - private String source; + private CharSequence source; private LocalizedSegmenter segmenter; private BreakIterator breakIter; - private LocalizedSegments(String source, LocalizedSegmenter segmenter) { + private LocalizedSegments(CharSequence source, LocalizedSegmenter segmenter) { this.source = source; this.segmenter = segmenter; this.breakIter = this.segmenter.getNewBreakIterator(); } @Override - public String getSourceString() { + public CharSequence getSourceSequence() { return source; } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 393ef0b3f5c5..e69f4b89e278 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -9,7 +9,7 @@ public class RuleBasedSegmenter implements Segmenter { @Override - public Segments segment(String s) { + public Segments segment(CharSequence s) { return new RuleBasedSegments(s, this); } @@ -47,20 +47,20 @@ public RuleBasedSegmenter build() { } public static class RuleBasedSegments implements Segments { - private String source; + private CharSequence source; private RuleBasedSegmenter segmenter; private BreakIterator breakIter; - RuleBasedSegments(String source, RuleBasedSegmenter segmenter) { + RuleBasedSegments(CharSequence source, RuleBasedSegmenter segmenter) { this.source = source; this.segmenter = segmenter; this.breakIter = this.segmenter.getNewBreakIterator(); } @Override - public String getSourceString() { + public CharSequence getSourceSequence() { return this.source; } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java index 4f1832986c5d..d38fa4609950 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java @@ -3,7 +3,7 @@ import com.ibm.icu.text.BreakIterator; public interface Segmenter { - Segments segment(String s); + Segments segment(CharSequence s); @Deprecated BreakIterator getNewBreakIterator(); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 6f2d6184894f..2bc1ef1e3eca 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -8,7 +8,7 @@ public interface Segments { - String getSourceString(); + CharSequence getSourceSequence(); @Deprecated Segmenter getSegmenter(); @@ -26,7 +26,7 @@ default Stream ranges() { default Stream rangesAfterIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceString()); + breakIter.setText(getSourceSequence()); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager RangeIterable iterable = new RangeIterable(breakIter, IterationDirection.FORWARDS, i); @@ -35,7 +35,7 @@ default Stream rangesAfterIndex(int i) { default Stream rangesBeforeIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceString()); + breakIter.setText(getSourceSequence()); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager RangeIterable iterable = new RangeIterable(breakIter, IterationDirection.BACKWARDS, i); @@ -44,7 +44,7 @@ default Stream rangesBeforeIndex(int i) { default Range rangeAfterIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceString()); + breakIter.setText(getSourceSequence()); int start = breakIter.following(i); if (start == BreakIterator.DONE) { @@ -58,7 +58,7 @@ default Range rangeAfterIndex(int i) { default Range rangeBeforeIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceString()); + breakIter.setText(getSourceSequence()); // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs @@ -80,7 +80,7 @@ default Range rangeBeforeIndex(int i) { } default Function rangeToSequenceFn() { - return range -> getSourceString().subSequence(range.getStart(), range.getLimit()); + return range -> getSourceSequence().subSequence(range.getStart(), range.getLimit()); } class Range { From a57730f5a5feb122eb561a09bc7ce9464c61b928 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 11 Dec 2024 22:32:07 -0800 Subject: [PATCH 19/43] ICU-22789 Promote `int` to `long` for ICU `assertEquals` test compare method --- .../com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index bc00888df326..9008568f17a9 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -190,8 +190,8 @@ public void testRangeAfterIndex() { assert expLimit == null; assertThat("Out of bounds range should be null", range == null); } else { - assertEquals(desc + ", start", expStart.intValue(), range.getStart()); - assertEquals(desc + ", limit", expLimit.intValue(), range.getLimit()); + assertEquals(desc + ", start", (long) expStart.intValue(), range.getStart()); + assertEquals(desc + ", limit", (long) expLimit.intValue(), range.getLimit()); } } } @@ -235,8 +235,8 @@ public void testRangeBeforeIndex() { assert expLimit == null; assertThat("Out of bounds range should be null", range == null); } else { - assertEquals(desc + ", start", expStart.intValue(), range.getStart()); - assertEquals(desc + ", limit", expLimit.intValue(), range.getLimit()); + assertEquals(desc + ", start", (long) expStart.intValue(), (long) range.getStart()); + assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) range.getLimit()); } } } From e20fa355ccc407ba0f2110d9ccf230759b9e576f Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 11 Dec 2024 22:51:03 -0800 Subject: [PATCH 20/43] ICU-22789 Fix bug for next range when only `limit`==`DONE` but not `start` --- .../main/java/com/ibm/icu/text/segmenter/Segments.java | 3 +++ .../ibm/icu/dev/test/text/segmenter/SegmentsTest.java | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 2bc1ef1e3eca..3941700ce973 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -52,6 +52,9 @@ default Range rangeAfterIndex(int i) { } int limit = breakIter.next(); + if (limit == BreakIterator.DONE) { + return null; + } return new Range(start, limit); } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 9008568f17a9..674bf1d58c8b 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -171,11 +171,11 @@ public void testRangeAfterIndex() { Segments segments = enWordSegmenter.segment(source); Object[][] casesData = { - {"index before beginning", -2, 0, 3}, - {"index at beginning", 0, 3, 4}, - {"index in the middle (end of first segment)", 3, 4, 9}, - {"index at the end", source.length()-1, source.length(), -1}, - {"index after the end", source.length()+1, null, null}, + {"index before beginning", -2, 0, 3}, + {"index at beginning", 0, 3, 4}, + {"index in the middle (end of first segment)", 3, 4, 9}, + {"index at the end", source.length()-1, null, null}, + {"index after the end", source.length()+1, null, null}, }; for (Object[] caseDatum : casesData) { From 03f1e07164ec7280c4c3483524a4cc2d5a7fa0cc Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 11 Dec 2024 23:38:25 -0800 Subject: [PATCH 21/43] ICU-22789 Add APIs for IntStream of boundary indices --- .../com/ibm/icu/text/segmenter/Segments.java | 109 +++++++++++++++++- .../dev/test/text/segmenter/SegmentsTest.java | 88 +++++++++++++- 2 files changed, 189 insertions(+), 8 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 3941700ce973..dd045a51aaca 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -3,6 +3,7 @@ import com.ibm.icu.text.BreakIterator; import java.util.Iterator; import java.util.function.Function; +import java.util.stream.IntStream; import java.util.stream.Stream; import java.util.stream.StreamSupport; @@ -86,6 +87,43 @@ default Function rangeToSequenceFn() { return range -> getSourceSequence().subSequence(range.getStart(), range.getLimit()); } + default IntStream boundaries() { + return boundariesAfterIndex(-1); + } + + default IntStream boundariesAfterIndex(int i) { + BreakIterator breakIter = getInstanceBreakIterator(); + breakIter.setText(getSourceSequence()); + + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.FORWARDS, i); + Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); + return boundariesAsIntegers.mapToInt(Integer::intValue); + } + + default IntStream boundariesBeforeIndex(int i) { + BreakIterator breakIter = getInstanceBreakIterator(); + breakIter.setText(getSourceSequence()); + + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, i); + Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); + return boundariesAsIntegers.mapToInt(Integer::intValue); + } + + // + // Inner enums/classes in common for other inner classes + // + + enum IterationDirection { + FORWARDS, + BACKWARDS, + } + + // + // Inner classes for Range, RangeIterable, and RangeIterator + // + class Range { int start; int limit; @@ -104,11 +142,6 @@ public int getLimit() { } } - enum IterationDirection { - FORWARDS, - BACKWARDS, - } - /** * This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn * enables the creation of a lazy {@code Stream}. @@ -123,6 +156,7 @@ class RangeIterable implements Iterable { this.direction = direction; this.startIdx = startIdx; } + @Override public Iterator iterator() { return new RangeIterator(this.breakIter, this.direction, this.startIdx); @@ -132,7 +166,6 @@ public Iterator iterator() { class RangeIterator implements Iterator { BreakIterator breakIter; IterationDirection direction; - int startIdx; // remove this if not needed int start; int limit; @@ -178,6 +211,70 @@ public Range next() { return result; } + } + + // + // Inner classes for BoundaryIterable and BoundaryIterator + // + + class BoundaryIterable implements Iterable { + BreakIterator breakIter; + IterationDirection direction; + int startIdx; + + BoundaryIterable(BreakIterator breakIter, IterationDirection direction, int startIdx) { + this.breakIter = breakIter; + this.direction = direction; + this.startIdx = startIdx; + } + @Override + public Iterator iterator() { + return new BoundaryIterator(this.breakIter, this.direction, this.startIdx); + } } + + class BoundaryIterator implements Iterator { + BreakIterator breakIter; + IterationDirection direction; + int currIdx; + + BoundaryIterator(BreakIterator breakIter, IterationDirection direction, int startIdx) { + this.breakIter = breakIter; + this.direction = direction; + + // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs + if (startIdx < 0 && direction == IterationDirection.BACKWARDS) { + this.currIdx = BreakIterator.DONE; + return; + } + + if (direction == IterationDirection.FORWARDS) { + this.currIdx = breakIter.following(startIdx); + } else { + assert direction == IterationDirection.BACKWARDS; + this.currIdx = breakIter.preceding(startIdx); + } + } + + @Override + public boolean hasNext() { + return this.currIdx != BreakIterator.DONE; + } + + @Override + public Integer next() { + int result = this.currIdx; + + if (direction == IterationDirection.FORWARDS) { + this.currIdx = breakIter.next(); + } else { + assert direction == IterationDirection.BACKWARDS; + this.currIdx = breakIter.previous(); + } + + return result; + } + } + } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 674bf1d58c8b..2b830eeb9560 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -167,7 +167,7 @@ public void testRangeAfterIndex() { String source = "The quick brown fox jumped over the lazy dog."; - // Create new Segments for source1 + // Create new Segments for source Segments segments = enWordSegmenter.segment(source); Object[][] casesData = { @@ -207,7 +207,7 @@ public void testRangeBeforeIndex() { String source = "The quick brown fox jumped over the lazy dog."; - // Create new Segments for source1 + // Create new Segments for source Segments segments = enWordSegmenter.segment(source); Object[][] casesData = { @@ -241,4 +241,88 @@ public void testRangeBeforeIndex() { } } + @Test + public void testBoundaries() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source = "The quick brown fox jumped over the lazy dog."; + + // Create new Segments for source + Segments segments = enWordSegmenter.segment(source); + + int[] exp = {0, 3, 4, 9, 10, 15, 16, 19, 20, 26, 27, 31, 32, 35, 36, 40, 41, 44, 45}; + + int[] act = segments.boundaries().toArray(); + + assertThat(act, is(exp)); + } + + @Test + public void testBoundariesAfterIndex() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source = "The quick brown fox jumped over the lazy dog."; + int TAKE_LIMIT = 5; + + // Create new Segments for source + Segments segments = enWordSegmenter.segment(source); + + Object[][] casesData = { + {"first " + TAKE_LIMIT + " before beginning", -2, new int[]{0, 3, 4, 9, 10}}, + {"first " + TAKE_LIMIT + " in the middle of the third segment", 5, new int[]{9, 10, 15, 16, 19}}, + {"first " + TAKE_LIMIT + " at the end", source.length(), new int[0]}, + {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[0]}, + }; + + for (Object[] caseDatum : casesData) { + String desc = (String) caseDatum[0]; + int startIdx = (int) caseDatum[1]; + int[] exp = (int[]) caseDatum[2]; + + int[] act = segments.boundariesAfterIndex(startIdx).limit(TAKE_LIMIT).toArray(); + + assertThat(act, is(exp)); + } + } + + @Test + public void testBoundariesBeforeIndex() { + LocalizedSegmenter enWordSegmenter = + LocalizedSegmenter.builder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source = "The quick brown fox jumped over the lazy dog."; + int TAKE_LIMIT = 5; + + // Create new Segments for source + Segments segments = enWordSegmenter.segment(source); + + Object[][] casesData = { + {"first " + TAKE_LIMIT + " before beginning", -2, new int[0]}, + {"first " + TAKE_LIMIT + " at the beginning", 0, new int[0]}, + {"first " + TAKE_LIMIT + " in the middle of the 2nd to last", 42, new int[]{41, 40, 36, 35, 32}}, + {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[]{45, 44, 41, 40, 36}}, + }; + + for (Object[] caseDatum : casesData) { + String desc = (String) caseDatum[0]; + int startIdx = (int) caseDatum[1]; + int[] exp = (int[]) caseDatum[2]; + + int[] act = segments.boundariesBeforeIndex(startIdx).limit(TAKE_LIMIT).toArray(); + + assertThat(act, is(exp)); + } + } + } From 7f5dbd0ba047b4eafcf6203af46d0a98f94ead44 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Thu, 12 Dec 2024 07:26:00 -0800 Subject: [PATCH 22/43] ICU-22789 Add logKnownIssue in test for other API where it pertains --- .../com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 2b830eeb9560..711bbe7daaf3 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -322,6 +322,10 @@ public void testBoundariesBeforeIndex() { int[] act = segments.boundariesBeforeIndex(startIdx).limit(TAKE_LIMIT).toArray(); assertThat(act, is(exp)); + + if (startIdx == -2) { + logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); + } } } From 55226ac5691c8e7f987f6a037d3e8dca3da87c90 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Thu, 12 Dec 2024 07:27:42 -0800 Subject: [PATCH 23/43] ICU-22789 Minor formatting typo --- .../java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 711bbe7daaf3..525dd31e1140 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -278,7 +278,7 @@ public void testBoundariesAfterIndex() { Object[][] casesData = { {"first " + TAKE_LIMIT + " before beginning", -2, new int[]{0, 3, 4, 9, 10}}, {"first " + TAKE_LIMIT + " in the middle of the third segment", 5, new int[]{9, 10, 15, 16, 19}}, - {"first " + TAKE_LIMIT + " at the end", source.length(), new int[0]}, + {"first " + TAKE_LIMIT + " at the end", source.length(), new int[0]}, {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[0]}, }; From 60ea6f3f3b714b727096643ae1857d1b7586d50a Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 13 Dec 2024 14:14:56 -0800 Subject: [PATCH 24/43] ICU-22789 Refactor `Range` into `Segment` --- .../com/ibm/icu/text/segmenter/Segments.java | 59 ++++++++----------- .../dev/test/text/segmenter/SegmentsTest.java | 48 +++++++-------- 2 files changed, 50 insertions(+), 57 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index dd045a51aaca..388aa5e4292b 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -21,29 +21,29 @@ default Stream subSequences() { return ranges().map(rangeToSequenceFn()); } - default Stream ranges() { + default Stream ranges() { return rangesAfterIndex(-1); }; - default Stream rangesAfterIndex(int i) { + default Stream rangesAfterIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceSequence()); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - RangeIterable iterable = new RangeIterable(breakIter, IterationDirection.FORWARDS, i); + SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.FORWARDS, i); return StreamSupport.stream(iterable.spliterator(), false); } - default Stream rangesBeforeIndex(int i) { + default Stream rangesBeforeIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceSequence()); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - RangeIterable iterable = new RangeIterable(breakIter, IterationDirection.BACKWARDS, i); + SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.BACKWARDS, i); return StreamSupport.stream(iterable.spliterator(), false); } - default Range rangeAfterIndex(int i) { + default Segment rangeAfterIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceSequence()); @@ -57,10 +57,10 @@ default Range rangeAfterIndex(int i) { return null; } - return new Range(start, limit); + return new Segment(start, limit); } - default Range rangeBeforeIndex(int i) { + default Segment rangeBeforeIndex(int i) { BreakIterator breakIter = getInstanceBreakIterator(); breakIter.setText(getSourceSequence()); @@ -80,11 +80,11 @@ default Range rangeBeforeIndex(int i) { assert limit <= start; - return new Range(limit, start); + return new Segment(limit, start); } - default Function rangeToSequenceFn() { - return range -> getSourceSequence().subSequence(range.getStart(), range.getLimit()); + default Function rangeToSequenceFn() { + return segment -> getSourceSequence().subSequence(segment.start(), segment.limit()); } default IntStream boundaries() { @@ -124,52 +124,45 @@ enum IterationDirection { // Inner classes for Range, RangeIterable, and RangeIterator // - class Range { - int start; - int limit; + class Segment { + public final int start; + public final int limit; + public final int ruleStatus = 0; - public Range(int start, int limit) { + public Segment(int start, int limit) { this.start = start; this.limit = limit; } - - public int getStart() { - return start; - } - - public int getLimit() { - return limit; - } } /** * This {@code Iterable} exists to enable the creation of a {@code Spliterator} that in turn * enables the creation of a lazy {@code Stream}. */ - class RangeIterable implements Iterable { + class SegmentIterable implements Iterable { BreakIterator breakIter; IterationDirection direction; int startIdx; - RangeIterable(BreakIterator breakIter, IterationDirection direction, int startIdx) { + SegmentIterable(BreakIterator breakIter, IterationDirection direction, int startIdx) { this.breakIter = breakIter; this.direction = direction; this.startIdx = startIdx; } @Override - public Iterator iterator() { - return new RangeIterator(this.breakIter, this.direction, this.startIdx); + public Iterator iterator() { + return new SegmentIterator(this.breakIter, this.direction, this.startIdx); } } - class RangeIterator implements Iterator { + class SegmentIterator implements Iterator { BreakIterator breakIter; IterationDirection direction; int start; int limit; - RangeIterator(BreakIterator breakIter, IterationDirection direction, int startIdx) { + SegmentIterator(BreakIterator breakIter, IterationDirection direction, int startIdx) { this.breakIter = breakIter; this.direction = direction; @@ -198,12 +191,12 @@ public boolean hasNext() { } @Override - public Range next() { - Range result; + public Segment next() { + Segment result; if (this.limit < this.start) { - result = new Range(this.limit, this.start); + result = new Segment(this.limit, this.start); } else { - result = new Range(this.start, this.limit); + result = new Segment(this.start, this.limit); } this.start = this.limit; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 525dd31e1140..24d1e6c2f4c7 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -7,7 +7,7 @@ import com.ibm.icu.text.segmenter.LocalizedSegmenter; import com.ibm.icu.text.segmenter.Segmenter.SegmentationType; import com.ibm.icu.text.segmenter.Segments; -import com.ibm.icu.text.segmenter.Segments.Range; +import com.ibm.icu.text.segmenter.Segments.Segment; import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.List; @@ -32,13 +32,13 @@ public void testRanges() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List ranges = segments1.ranges().collect(Collectors.toList()); + List segments = segments1.ranges().collect(Collectors.toList()); - assertEquals("first range start", 0, ranges.get(0).getStart()); - assertEquals("first range limit", 3, ranges.get(0).getLimit()); + assertEquals("first range start", 0, segments.get(0).start()); + assertEquals("first range limit", 3, segments.get(0).limit()); - assertEquals("second range start", 3, ranges.get(1).getStart()); - assertEquals("second range limit", 4, ranges.get(1).getLimit()); + assertEquals("second range start", 3, segments.get(1).start()); + assertEquals("second range limit", 4, segments.get(1).limit()); } @Test @@ -102,13 +102,13 @@ public void testRangesAfterIndex() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List ranges = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); + List segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); - assertEquals("first range start", 3, ranges.get(0).getStart()); - assertEquals("first range limit", 4, ranges.get(0).getLimit()); + assertEquals("first range start", 3, segments.get(0).start()); + assertEquals("first range limit", 4, segments.get(0).limit()); - assertEquals("second range start", 4, ranges.get(1).getStart()); - assertEquals("second range limit", 9, ranges.get(1).getLimit()); + assertEquals("second range start", 4, segments.get(1).start()); + assertEquals("second range limit", 9, segments.get(1).limit()); } @Test @@ -125,13 +125,13 @@ public void testRangesBeforeIndex() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List ranges = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); + List segments = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); - assertEquals("first range start", 4, ranges.get(0).getStart()); - assertEquals("first range limit", 9, ranges.get(0).getLimit()); + assertEquals("first range start", 4, segments.get(0).start()); + assertEquals("first range limit", 9, segments.get(0).limit()); - assertEquals("second range start", 3, ranges.get(1).getStart()); - assertEquals("second range limit", 4, ranges.get(1).getLimit()); + assertEquals("second range start", 3, segments.get(1).start()); + assertEquals("second range limit", 4, segments.get(1).limit()); } @Test @@ -184,14 +184,14 @@ public void testRangeAfterIndex() { Integer expStart = (Integer) caseDatum[2]; Integer expLimit = (Integer) caseDatum[3]; - Range range = segments.rangeAfterIndex(startIdx); + Segment segment = segments.rangeAfterIndex(startIdx); if (expStart == null) { assert expLimit == null; - assertThat("Out of bounds range should be null", range == null); + assertThat("Out of bounds range should be null", segment == null); } else { - assertEquals(desc + ", start", (long) expStart.intValue(), range.getStart()); - assertEquals(desc + ", limit", (long) expLimit.intValue(), range.getLimit()); + assertEquals(desc + ", start", (long) expStart.intValue(), segment.start()); + assertEquals(desc + ", limit", (long) expLimit.intValue(), segment.limit()); } } } @@ -225,7 +225,7 @@ public void testRangeBeforeIndex() { Integer expStart = (Integer) caseDatum[2]; Integer expLimit = (Integer) caseDatum[3]; - Range range = segments.rangeBeforeIndex(startIdx); + Segment segment = segments.rangeBeforeIndex(startIdx); if (startIdx == -2) { logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); @@ -233,10 +233,10 @@ public void testRangeBeforeIndex() { if (expStart == null) { assert expLimit == null; - assertThat("Out of bounds range should be null", range == null); + assertThat("Out of bounds range should be null", segment == null); } else { - assertEquals(desc + ", start", (long) expStart.intValue(), (long) range.getStart()); - assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) range.getLimit()); + assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start()); + assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit()); } } } From 94ec357df5ccaebfd1a305a54528f943f2024c98 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 31 Dec 2024 14:43:31 -0800 Subject: [PATCH 25/43] ICU-22789 Make followup adjustments to Segment field accessors after refactor --- .../com/ibm/icu/text/segmenter/Segments.java | 2 +- .../dev/test/text/segmenter/SegmentsTest.java | 32 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 388aa5e4292b..66897108252c 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -84,7 +84,7 @@ default Segment rangeBeforeIndex(int i) { } default Function rangeToSequenceFn() { - return segment -> getSourceSequence().subSequence(segment.start(), segment.limit()); + return segment -> getSourceSequence().subSequence(segment.start, segment.limit); } default IntStream boundaries() { diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 24d1e6c2f4c7..74581e787b5f 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -34,11 +34,11 @@ public void testRanges() { List segments = segments1.ranges().collect(Collectors.toList()); - assertEquals("first range start", 0, segments.get(0).start()); - assertEquals("first range limit", 3, segments.get(0).limit()); + assertEquals("first range start", 0, segments.get(0).start); + assertEquals("first range limit", 3, segments.get(0).limit); - assertEquals("second range start", 3, segments.get(1).start()); - assertEquals("second range limit", 4, segments.get(1).limit()); + assertEquals("second range start", 3, segments.get(1).start); + assertEquals("second range limit", 4, segments.get(1).limit); } @Test @@ -104,11 +104,11 @@ public void testRangesAfterIndex() { List segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); - assertEquals("first range start", 3, segments.get(0).start()); - assertEquals("first range limit", 4, segments.get(0).limit()); + assertEquals("first range start", 3, segments.get(0).start); + assertEquals("first range limit", 4, segments.get(0).limit); - assertEquals("second range start", 4, segments.get(1).start()); - assertEquals("second range limit", 9, segments.get(1).limit()); + assertEquals("second range start", 4, segments.get(1).start); + assertEquals("second range limit", 9, segments.get(1).limit); } @Test @@ -127,11 +127,11 @@ public void testRangesBeforeIndex() { List segments = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); - assertEquals("first range start", 4, segments.get(0).start()); - assertEquals("first range limit", 9, segments.get(0).limit()); + assertEquals("first range start", 4, segments.get(0).start); + assertEquals("first range limit", 9, segments.get(0).limit); - assertEquals("second range start", 3, segments.get(1).start()); - assertEquals("second range limit", 4, segments.get(1).limit()); + assertEquals("second range start", 3, segments.get(1).start); + assertEquals("second range limit", 4, segments.get(1).limit); } @Test @@ -190,8 +190,8 @@ public void testRangeAfterIndex() { assert expLimit == null; assertThat("Out of bounds range should be null", segment == null); } else { - assertEquals(desc + ", start", (long) expStart.intValue(), segment.start()); - assertEquals(desc + ", limit", (long) expLimit.intValue(), segment.limit()); + assertEquals(desc + ", start", (long) expStart.intValue(), segment.start); + assertEquals(desc + ", limit", (long) expLimit.intValue(), segment.limit); } } } @@ -235,8 +235,8 @@ public void testRangeBeforeIndex() { assert expLimit == null; assertThat("Out of bounds range should be null", segment == null); } else { - assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start()); - assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit()); + assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start); + assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit); } } } From 5b6eaddcedd6fa85811461d70f6211f397d5a18b Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 31 Dec 2024 15:29:50 -0800 Subject: [PATCH 26/43] ICU-22789 Refactor default impls of `Segments` interface into reusable static util fns for concrete classes --- .../text/segmenter/LocalizedSegmenter.java | 55 +++++++++- .../text/segmenter/RuleBasedSegmenter.java | 51 ++++++++- .../com/ibm/icu/text/segmenter/Segments.java | 100 ++--------------- .../icu/text/segmenter/SegmentsImplUtils.java | 103 ++++++++++++++++++ 4 files changed, 209 insertions(+), 100 deletions(-) create mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 44d67096125d..0ffb80940370 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -2,6 +2,9 @@ import com.ibm.icu.text.BreakIterator; import com.ibm.icu.util.ULocale; +import java.util.function.Function; +import java.util.stream.IntStream; +import java.util.stream.Stream; public class LocalizedSegmenter implements Segmenter { @@ -76,7 +79,7 @@ public LocalizedSegmenter build() { } - public static class LocalizedSegments implements Segments { + public class LocalizedSegments implements Segments { private CharSequence source; @@ -92,17 +95,57 @@ private LocalizedSegments(CharSequence source, LocalizedSegmenter segmenter) { @Override public CharSequence getSourceSequence() { - return source; + return this.source; } @Override - public Segmenter getSegmenter() { - return segmenter; + public Stream subSequences() { + return SegmentsImplUtils.subSequences(this.breakIter, this.source); } @Override - public BreakIterator getInstanceBreakIterator() { - return this.breakIter; + public Stream ranges() { + return SegmentsImplUtils.ranges(this.breakIter, this.source); + } + + @Override + public Stream rangesAfterIndex(int i) { + return SegmentsImplUtils.rangesAfterIndex(this.breakIter, this.source, i); + } + + @Override + public Stream rangesBeforeIndex(int i) { + return SegmentsImplUtils.rangesBeforeIndex(this.breakIter, this.source, i); + } + + @Override + public Segment rangeAfterIndex(int i) { + return SegmentsImplUtils.rangeAfterIndex(this.breakIter, this.source, i); + } + + @Override + public Segment rangeBeforeIndex(int i) { + return SegmentsImplUtils.rangeBeforeIndex(this.breakIter, this.source, i); + } + + @Override + public Function rangeToSequenceFn() { + return SegmentsImplUtils.rangeToSequenceFn(this.source); + } + + @Override + public IntStream boundaries() { + return SegmentsImplUtils.boundaries(this.breakIter, this.source); + } + + @Override + public IntStream boundariesAfterIndex(int i) { + return SegmentsImplUtils.boundariesAfterIndex(this.breakIter, this.source, i); + } + + @Override + public IntStream boundariesBeforeIndex(int i) { + return SegmentsImplUtils.boundariesBeforeIndex(this.breakIter, this.source, i); } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index e69f4b89e278..4babe4265a76 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -2,6 +2,9 @@ import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; +import java.util.function.Function; +import java.util.stream.IntStream; +import java.util.stream.Stream; public class RuleBasedSegmenter implements Segmenter { @@ -65,13 +68,53 @@ public CharSequence getSourceSequence() { } @Override - public Segmenter getSegmenter() { - return segmenter; + public Stream subSequences() { + return SegmentsImplUtils.subSequences(this.breakIter, this.source); } @Override - public BreakIterator getInstanceBreakIterator() { - return this.breakIter; + public Stream ranges() { + return SegmentsImplUtils.ranges(this.breakIter, this.source); + } + + @Override + public Stream rangesAfterIndex(int i) { + return SegmentsImplUtils.rangesAfterIndex(this.breakIter, this.source, i); + } + + @Override + public Stream rangesBeforeIndex(int i) { + return SegmentsImplUtils.rangesBeforeIndex(this.breakIter, this.source, i); + } + + @Override + public Segment rangeAfterIndex(int i) { + return SegmentsImplUtils.rangeAfterIndex(this.breakIter, this.source, i); + } + + @Override + public Segment rangeBeforeIndex(int i) { + return SegmentsImplUtils.rangeBeforeIndex(this.breakIter, this.source, i); + } + + @Override + public Function rangeToSequenceFn() { + return SegmentsImplUtils.rangeToSequenceFn(this.source); + } + + @Override + public IntStream boundaries() { + return SegmentsImplUtils.boundaries(this.breakIter, this.source); + } + + @Override + public IntStream boundariesAfterIndex(int i) { + return SegmentsImplUtils.boundariesAfterIndex(this.breakIter, this.source, i); + } + + @Override + public IntStream boundariesBeforeIndex(int i) { + return SegmentsImplUtils.boundariesBeforeIndex(this.breakIter, this.source, i); } } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 66897108252c..b2db9594fdc9 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -11,105 +11,25 @@ public interface Segments { CharSequence getSourceSequence(); - @Deprecated - Segmenter getSegmenter(); + Stream subSequences(); - @Deprecated - BreakIterator getInstanceBreakIterator(); + Stream ranges(); - default Stream subSequences() { - return ranges().map(rangeToSequenceFn()); - } - - default Stream ranges() { - return rangesAfterIndex(-1); - }; - - default Stream rangesAfterIndex(int i) { - BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceSequence()); - - // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.FORWARDS, i); - return StreamSupport.stream(iterable.spliterator(), false); - } - - default Stream rangesBeforeIndex(int i) { - BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceSequence()); - - // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.BACKWARDS, i); - return StreamSupport.stream(iterable.spliterator(), false); - } - - default Segment rangeAfterIndex(int i) { - BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceSequence()); - - int start = breakIter.following(i); - if (start == BreakIterator.DONE) { - return null; - } - - int limit = breakIter.next(); - if (limit == BreakIterator.DONE) { - return null; - } - - return new Segment(start, limit); - } - - default Segment rangeBeforeIndex(int i) { - BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceSequence()); - - - // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs - if (i < 0) { - // return the same thing as we would if preceding() returned DONE - return null; - } - - int start = breakIter.preceding(i); - int limit = breakIter.previous(); - - if (start == BreakIterator.DONE || limit == BreakIterator.DONE) { - return null; - } + Stream rangesAfterIndex(int i); - assert limit <= start; + Stream rangesBeforeIndex(int i); - return new Segment(limit, start); - } + Segment rangeAfterIndex(int i); - default Function rangeToSequenceFn() { - return segment -> getSourceSequence().subSequence(segment.start, segment.limit); - } + Segment rangeBeforeIndex(int i); - default IntStream boundaries() { - return boundariesAfterIndex(-1); - } + Function rangeToSequenceFn(); - default IntStream boundariesAfterIndex(int i) { - BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceSequence()); + IntStream boundaries(); - // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.FORWARDS, i); - Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); - return boundariesAsIntegers.mapToInt(Integer::intValue); - } - - default IntStream boundariesBeforeIndex(int i) { - BreakIterator breakIter = getInstanceBreakIterator(); - breakIter.setText(getSourceSequence()); + IntStream boundariesAfterIndex(int i); - // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, i); - Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); - return boundariesAsIntegers.mapToInt(Integer::intValue); - } + IntStream boundariesBeforeIndex(int i); // // Inner enums/classes in common for other inner classes diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java new file mode 100644 index 000000000000..95e2888c6212 --- /dev/null +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -0,0 +1,103 @@ +package com.ibm.icu.text.segmenter; + +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.segmenter.Segments.BoundaryIterable; +import com.ibm.icu.text.segmenter.Segments.IterationDirection; +import com.ibm.icu.text.segmenter.Segments.Segment; +import com.ibm.icu.text.segmenter.Segments.SegmentIterable; +import java.util.function.Function; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +public class SegmentsImplUtils { + + public static Stream subSequences(BreakIterator breakIter, CharSequence sourceSequence) { + return ranges(breakIter, sourceSequence).map(rangeToSequenceFn(sourceSequence)); + } + + public static Stream ranges(BreakIterator breakIter, CharSequence sourceSequence) { + return rangesAfterIndex(breakIter, sourceSequence, -1); + }; + + public static Stream rangesAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + breakIter.setText(sourceSequence); + + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.FORWARDS, i); + return StreamSupport.stream(iterable.spliterator(), false); + } + + public static Stream rangesBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + breakIter.setText(sourceSequence); + + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.BACKWARDS, i); + return StreamSupport.stream(iterable.spliterator(), false); + } + + public static Segment rangeAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + breakIter.setText(sourceSequence); + + int start = breakIter.following(i); + if (start == BreakIterator.DONE) { + return null; + } + + int limit = breakIter.next(); + if (limit == BreakIterator.DONE) { + return null; + } + + return new Segment(start, limit); + } + + public static Segment rangeBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + breakIter.setText(sourceSequence); + + + // TODO(ICU-22987): Remove after fixing preceding(int) to return `DONE` for negative inputs + if (i < 0) { + // return the same thing as we would if preceding() returned DONE + return null; + } + + int start = breakIter.preceding(i); + int limit = breakIter.previous(); + + if (start == BreakIterator.DONE || limit == BreakIterator.DONE) { + return null; + } + + assert limit <= start; + + return new Segment(limit, start); + } + + public static Function rangeToSequenceFn(CharSequence sourceSequence) { + return segment -> sourceSequence.subSequence(segment.start, segment.limit); + } + + public static IntStream boundaries(BreakIterator breakIter, CharSequence sourceSequence) { + return boundariesAfterIndex(breakIter, sourceSequence, -1); + } + + public static IntStream boundariesAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + breakIter.setText(sourceSequence); + + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.FORWARDS, i); + Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); + return boundariesAsIntegers.mapToInt(Integer::intValue); + } + + public static IntStream boundariesBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + breakIter.setText(sourceSequence); + + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, i); + Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); + return boundariesAsIntegers.mapToInt(Integer::intValue); + } + +} From 8ee08e31c8cad621cb73abdc114e99df9fcf3f86 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 1 Jan 2025 18:05:30 -0800 Subject: [PATCH 27/43] ICU-22789 Add source CharSequence to Segment class --- .../com/ibm/icu/text/segmenter/Segments.java | 24 ++++++++++++------- .../icu/text/segmenter/SegmentsImplUtils.java | 8 +++---- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index b2db9594fdc9..9e072ae1711a 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -41,17 +41,19 @@ enum IterationDirection { } // - // Inner classes for Range, RangeIterable, and RangeIterator + // Inner classes for Segment, SegmentIterable, and SegmentIterator // class Segment { public final int start; public final int limit; public final int ruleStatus = 0; + public final CharSequence soruce; - public Segment(int start, int limit) { + public Segment(int start, int limit, CharSequence source) { this.start = start; this.limit = limit; + this.soruce = source; } } @@ -61,30 +63,34 @@ public Segment(int start, int limit) { */ class SegmentIterable implements Iterable { BreakIterator breakIter; - IterationDirection direction; + final IterationDirection direction; int startIdx; + final CharSequence source; - SegmentIterable(BreakIterator breakIter, IterationDirection direction, int startIdx) { + SegmentIterable(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) { this.breakIter = breakIter; this.direction = direction; this.startIdx = startIdx; + this.source = source; } @Override public Iterator iterator() { - return new SegmentIterator(this.breakIter, this.direction, this.startIdx); + return new SegmentIterator(this.breakIter, this.direction, this.startIdx, this.source); } } class SegmentIterator implements Iterator { BreakIterator breakIter; - IterationDirection direction; + final IterationDirection direction; int start; int limit; + final CharSequence source; - SegmentIterator(BreakIterator breakIter, IterationDirection direction, int startIdx) { + SegmentIterator(BreakIterator breakIter, IterationDirection direction, int startIdx, CharSequence source) { this.breakIter = breakIter; this.direction = direction; + this.source = source; if (direction == IterationDirection.FORWARDS) { this.start = breakIter.following(startIdx); @@ -114,9 +120,9 @@ public boolean hasNext() { public Segment next() { Segment result; if (this.limit < this.start) { - result = new Segment(this.limit, this.start); + result = new Segment(this.limit, this.start, this.source); } else { - result = new Segment(this.start, this.limit); + result = new Segment(this.start, this.limit, this.source); } this.start = this.limit; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index 95e2888c6212..e88e9aea6c20 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -24,7 +24,7 @@ public static Stream rangesAfterIndex(BreakIterator breakIter, CharSequ breakIter.setText(sourceSequence); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.FORWARDS, i); + SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.FORWARDS, i, sourceSequence); return StreamSupport.stream(iterable.spliterator(), false); } @@ -32,7 +32,7 @@ public static Stream rangesBeforeIndex(BreakIterator breakIter, CharSeq breakIter.setText(sourceSequence); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.BACKWARDS, i); + SegmentIterable iterable = new SegmentIterable(breakIter, IterationDirection.BACKWARDS, i, sourceSequence); return StreamSupport.stream(iterable.spliterator(), false); } @@ -49,7 +49,7 @@ public static Segment rangeAfterIndex(BreakIterator breakIter, CharSequence sour return null; } - return new Segment(start, limit); + return new Segment(start, limit, sourceSequence); } public static Segment rangeBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { @@ -71,7 +71,7 @@ public static Segment rangeBeforeIndex(BreakIterator breakIter, CharSequence sou assert limit <= start; - return new Segment(limit, start); + return new Segment(limit, start, sourceSequence); } public static Function rangeToSequenceFn(CharSequence sourceSequence) { From 767789a48a6ca6d24f4b73931c511bf8a15ea268 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 1 Jan 2025 20:08:49 -0800 Subject: [PATCH 28/43] ICU-22789 Remove unused getters --- .../icu/text/segmenter/LocalizedSegmenter.java | 18 +++++------------- .../icu/text/segmenter/RuleBasedSegmenter.java | 10 ---------- .../com/ibm/icu/text/segmenter/Segments.java | 3 --- 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 0ffb80940370..b5a070045506 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -17,14 +17,6 @@ public Segments segment(CharSequence s) { return new LocalizedSegments(s, this); } - public ULocale getLocale() { - return this.locale; - } - - public SegmentationType getSegmentationType() { - return this.segmentationType; - } - public static Builder builder() { return new Builder(); } @@ -34,7 +26,12 @@ public static Builder builder() { this.segmentationType = segmentationType; } + /** + * @Deprecated internal + * @return + */ @Override + @Deprecated public BreakIterator getNewBreakIterator() { BreakIterator breakIter; switch (this.segmentationType) { @@ -93,11 +90,6 @@ private LocalizedSegments(CharSequence source, LocalizedSegmenter segmenter) { this.breakIter = this.segmenter.getNewBreakIterator(); } - @Override - public CharSequence getSourceSequence() { - return this.source; - } - @Override public Stream subSequences() { return SegmentsImplUtils.subSequences(this.breakIter, this.source); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 4babe4265a76..4085781605ae 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -10,16 +10,11 @@ public class RuleBasedSegmenter implements Segmenter { private String rules; - @Override public Segments segment(CharSequence s) { return new RuleBasedSegments(s, this); } - public String getRules() { - return this.rules; - } - public static Builder builder() { return new Builder(); } @@ -62,11 +57,6 @@ public static class RuleBasedSegments implements Segments { this.breakIter = this.segmenter.getNewBreakIterator(); } - @Override - public CharSequence getSourceSequence() { - return this.source; - } - @Override public Stream subSequences() { return SegmentsImplUtils.subSequences(this.breakIter, this.source); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 9e072ae1711a..0cc615768f86 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -8,9 +8,6 @@ import java.util.stream.StreamSupport; public interface Segments { - - CharSequence getSourceSequence(); - Stream subSequences(); Stream ranges(); From d9445d2aca5cbc49fa2c91c0912071d7bab5a3b5 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 1 Jan 2025 20:12:36 -0800 Subject: [PATCH 29/43] ICU-22789 Move SegmentationType enum back into LocalizedSegmenter --- .../com/ibm/icu/text/segmenter/LocalizedSegmenter.java | 7 +++++++ .../java/com/ibm/icu/text/segmenter/Segmenter.java | 6 ------ .../test/text/segmenter/LocalizedSegmenterTest.java | 2 +- .../ibm/icu/dev/test/text/segmenter/SegmentsTest.java | 10 +++++----- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index b5a070045506..8a241525b6d5 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -52,6 +52,13 @@ public BreakIterator getNewBreakIterator() { return breakIter; } + public enum SegmentationType { + GRAPHEME_CLUSTER, + WORD, + LINE, + SENTENCE, + } + public static class Builder { private ULocale locale = ULocale.ROOT; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java index d38fa4609950..003f676b1788 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java @@ -8,10 +8,4 @@ public interface Segmenter { @Deprecated BreakIterator getNewBreakIterator(); - public enum SegmentationType { - GRAPHEME_CLUSTER, - WORD, - LINE, - SENTENCE, - } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java index 477d8bca5d51..6e4c10f25db2 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java @@ -5,7 +5,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.LocalizedSegmenter; -import com.ibm.icu.text.segmenter.Segmenter.SegmentationType; +import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.util.ULocale; diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 74581e787b5f..33174b1f8e95 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -5,7 +5,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.LocalizedSegmenter; -import com.ibm.icu.text.segmenter.Segmenter.SegmentationType; +import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.text.segmenter.Segments.Segment; import com.ibm.icu.util.ULocale; @@ -93,7 +93,7 @@ public void testRangesAfterIndex() { LocalizedSegmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) - .setSegmentationType(SegmentationType.WORD) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); String source1 = "The quick brown fox jumped over the lazy dog."; @@ -116,7 +116,7 @@ public void testRangesBeforeIndex() { LocalizedSegmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) - .setSegmentationType(SegmentationType.WORD) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); String source1 = "The quick brown fox jumped over the lazy dog."; @@ -139,7 +139,7 @@ public void testRangeToSequenceFn() { LocalizedSegmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) - .setSegmentationType(SegmentationType.WORD) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); String source1 = "The quick brown fox jumped over the lazy dog."; @@ -162,7 +162,7 @@ public void testRangeAfterIndex() { LocalizedSegmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) - .setSegmentationType(SegmentationType.WORD) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); String source = "The quick brown fox jumped over the lazy dog."; From 9efe2df23f814a84a08cb5291f017700e9a965d4 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 1 Jan 2025 20:35:40 -0800 Subject: [PATCH 30/43] ICU-22789 Mark getNewBreakIterator internal until we can remove it --- .../java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java | 4 ++-- .../java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java | 5 +++++ .../src/main/java/com/ibm/icu/text/segmenter/Segmenter.java | 4 ++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 8a241525b6d5..d8a8d7c20000 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -27,8 +27,8 @@ public static Builder builder() { } /** - * @Deprecated internal - * @return + * @internal + * @deprecated This API is ICU internal only. */ @Override @Deprecated diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 4085781605ae..87a5cf1e79ef 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -23,7 +23,12 @@ public static Builder builder() { this.rules = rules; } + /** + * @internal + * @deprecated This API is ICU internal only. + */ @Override + @Deprecated public RuleBasedBreakIterator getNewBreakIterator() { return new RuleBasedBreakIterator(this.rules); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java index 003f676b1788..f761e08d8dab 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segmenter.java @@ -5,6 +5,10 @@ public interface Segmenter { Segments segment(CharSequence s); + /** + * @internal + * @deprecated This API is ICU internal only. + */ @Deprecated BreakIterator getNewBreakIterator(); From e0f3554b33855a032585588a6cc0c99946dbaa78 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 1 Jan 2025 20:38:21 -0800 Subject: [PATCH 31/43] ICU-22789 Use interface type in declarations used in tests --- .../dev/test/text/segmenter/SegmentsTest.java | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 33174b1f8e95..80a25de4d65b 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -6,6 +6,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.LocalizedSegmenter; import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.text.segmenter.Segments.Segment; import com.ibm.icu.util.ULocale; @@ -21,7 +22,7 @@ public class SegmentsTest extends CoreTestFmwk { @Test public void testRanges() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) @@ -43,7 +44,7 @@ public void testRanges() { @Test public void testMultipleSegmentObjectsFromSegmenter() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) @@ -90,7 +91,7 @@ public void testMultipleSegmentObjectsFromSegmenter() { @Test public void testRangesAfterIndex() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) @@ -113,7 +114,7 @@ public void testRangesAfterIndex() { @Test public void testRangesBeforeIndex() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) @@ -136,7 +137,7 @@ public void testRangesBeforeIndex() { @Test public void testRangeToSequenceFn() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) @@ -159,7 +160,7 @@ public void testRangeToSequenceFn() { @Test public void testRangeAfterIndex() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) @@ -199,7 +200,7 @@ public void testRangeAfterIndex() { @Test public void testRangeBeforeIndex() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) @@ -243,7 +244,7 @@ public void testRangeBeforeIndex() { @Test public void testBoundaries() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) @@ -263,7 +264,7 @@ public void testBoundaries() { @Test public void testBoundariesAfterIndex() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) @@ -295,7 +296,7 @@ public void testBoundariesAfterIndex() { @Test public void testBoundariesBeforeIndex() { - LocalizedSegmenter enWordSegmenter = + Segmenter enWordSegmenter = LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) From f12d724f505fed4dd96e4fd360d02e7b9afe5555 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 1 Jan 2025 21:11:19 -0800 Subject: [PATCH 32/43] ICU-22789 Create top level classes for builders of concrete Segmenter types --- .../text/segmenter/LocalizedSegmenter.java | 28 ------------------- .../segmenter/LocalizedSegmenterBuilder.java | 27 ++++++++++++++++++ .../text/segmenter/RuleBasedSegmenter.java | 20 ------------- .../segmenter/RuleBasedSegmenterBuilder.java | 17 +++++++++++ .../segmenter/LocalizedSegmenterTest.java | 3 +- .../segmenter/RuleBasedSegmenterTest.java | 3 +- .../dev/test/text/segmenter/SegmentsTest.java | 21 +++++++------- 7 files changed, 59 insertions(+), 60 deletions(-) create mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java create mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index d8a8d7c20000..64e24514a79b 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -17,10 +17,6 @@ public Segments segment(CharSequence s) { return new LocalizedSegments(s, this); } - public static Builder builder() { - return new Builder(); - } - LocalizedSegmenter(ULocale locale, SegmentationType segmentationType) { this.locale = locale; this.segmentationType = segmentationType; @@ -59,30 +55,6 @@ public enum SegmentationType { SENTENCE, } - public static class Builder { - - private ULocale locale = ULocale.ROOT; - - private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER; - - Builder() { } - - public Builder setLocale(ULocale locale) { - this.locale = locale; - return this; - } - - public Builder setSegmentationType(SegmentationType segmentationType) { - this.segmentationType = segmentationType; - return this; - } - - public LocalizedSegmenter build() { - return new LocalizedSegmenter(this.locale, this.segmentationType); - } - - } - public class LocalizedSegments implements Segments { private CharSequence source; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java new file mode 100644 index 000000000000..ffd3138a19de --- /dev/null +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java @@ -0,0 +1,27 @@ +package com.ibm.icu.text.segmenter; + +import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.util.ULocale; + +public class LocalizedSegmenterBuilder { + + private ULocale locale = ULocale.ROOT; + + private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER; + + public LocalizedSegmenterBuilder() { } + + public LocalizedSegmenterBuilder setLocale(ULocale locale) { + this.locale = locale; + return this; + } + + public LocalizedSegmenterBuilder setSegmentationType(SegmentationType segmentationType) { + this.segmentationType = segmentationType; + return this; + } + + public LocalizedSegmenter build() { + return new LocalizedSegmenter(this.locale, this.segmentationType); + } +} diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 87a5cf1e79ef..d20d4518227d 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -15,10 +15,6 @@ public Segments segment(CharSequence s) { return new RuleBasedSegments(s, this); } - public static Builder builder() { - return new Builder(); - } - RuleBasedSegmenter(String rules) { this.rules = rules; } @@ -33,22 +29,6 @@ public RuleBasedBreakIterator getNewBreakIterator() { return new RuleBasedBreakIterator(this.rules); } - public static class Builder { - - String rules; - - Builder() { } - - public Builder setRules(String rules) { - this.rules = rules; - return this; - } - - public RuleBasedSegmenter build() { - return new RuleBasedSegmenter(this.rules); - } - } - public static class RuleBasedSegments implements Segments { private CharSequence source; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java new file mode 100644 index 000000000000..774f212ddca5 --- /dev/null +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java @@ -0,0 +1,17 @@ +package com.ibm.icu.text.segmenter; + +public class RuleBasedSegmenterBuilder { + + String rules; + + public RuleBasedSegmenterBuilder() { } + + public RuleBasedSegmenterBuilder setRules(String rules) { + this.rules = rules; + return this; + } + + public RuleBasedSegmenter build() { + return new RuleBasedSegmenter(this.rules); + } +} diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java index 6e4c10f25db2..6ba380eabdf1 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java @@ -6,6 +6,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.LocalizedSegmenter; import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.text.segmenter.LocalizedSegmenterBuilder; import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.util.ULocale; @@ -34,7 +35,7 @@ public void testLocaleInLocalizedSegmenter() { List expWords = (List) caseDatum[1]; Segmenter wordSeg = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(locale) .setSegmentationType(SegmentationType.WORD) .build(); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java index 5e46fe608038..6d08e1314702 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java @@ -5,6 +5,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.RuleBasedSegmenter; +import com.ibm.icu.text.segmenter.RuleBasedSegmenterBuilder; import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import java.util.Arrays; @@ -34,7 +35,7 @@ public void testRules() { // the following rule substring was taken as a subset from BreakIteratorRules_en_US_TEST.java: String rules = subrule; - Segmenter seg = RuleBasedSegmenter.builder() + Segmenter seg = new RuleBasedSegmenterBuilder() .setRules(rules) .build(); Segments segments = seg.segment(source); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 80a25de4d65b..31eca89fc2ca 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -6,6 +6,7 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.LocalizedSegmenter; import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; +import com.ibm.icu.text.segmenter.LocalizedSegmenterBuilder; import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.text.segmenter.Segments.Segment; @@ -23,7 +24,7 @@ public class SegmentsTest extends CoreTestFmwk { @Test public void testRanges() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -45,7 +46,7 @@ public void testRanges() { @Test public void testMultipleSegmentObjectsFromSegmenter() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -92,7 +93,7 @@ public void testMultipleSegmentObjectsFromSegmenter() { @Test public void testRangesAfterIndex() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -115,7 +116,7 @@ public void testRangesAfterIndex() { @Test public void testRangesBeforeIndex() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -138,7 +139,7 @@ public void testRangesBeforeIndex() { @Test public void testRangeToSequenceFn() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -161,7 +162,7 @@ public void testRangeToSequenceFn() { @Test public void testRangeAfterIndex() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -201,7 +202,7 @@ public void testRangeAfterIndex() { @Test public void testRangeBeforeIndex() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -245,7 +246,7 @@ public void testRangeBeforeIndex() { @Test public void testBoundaries() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -265,7 +266,7 @@ public void testBoundaries() { @Test public void testBoundariesAfterIndex() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -297,7 +298,7 @@ public void testBoundariesAfterIndex() { @Test public void testBoundariesBeforeIndex() { Segmenter enWordSegmenter = - LocalizedSegmenter.builder() + new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); From 0e2b1dbb81d95563dd6d14aaa29e965e700ab6e4 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Thu, 2 Jan 2025 10:00:08 -0800 Subject: [PATCH 33/43] ICU-22789 Add isBoundary API for Segments interface --- .../text/segmenter/LocalizedSegmenter.java | 5 +++ .../text/segmenter/RuleBasedSegmenter.java | 5 +++ .../com/ibm/icu/text/segmenter/Segments.java | 8 +++++ .../icu/text/segmenter/SegmentsImplUtils.java | 8 ++++- .../dev/test/text/segmenter/SegmentsTest.java | 32 +++++++++++++++++++ 5 files changed, 57 insertions(+), 1 deletion(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 64e24514a79b..f35f283a75ea 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -79,6 +79,11 @@ public Stream ranges() { return SegmentsImplUtils.ranges(this.breakIter, this.source); } + @Override + public boolean isBoundary(int i) { + return SegmentsImplUtils.isBoundary(this.breakIter, this.source, i); + } + @Override public Stream rangesAfterIndex(int i) { return SegmentsImplUtils.rangesAfterIndex(this.breakIter, this.source, i); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index d20d4518227d..71f44696d207 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -52,6 +52,11 @@ public Stream ranges() { return SegmentsImplUtils.ranges(this.breakIter, this.source); } + @Override + public boolean isBoundary(int i) { + return SegmentsImplUtils.isBoundary(this.breakIter, this.source, i); + } + @Override public Stream rangesAfterIndex(int i) { return SegmentsImplUtils.rangesAfterIndex(this.breakIter, this.source, i); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 0cc615768f86..b05bd416bcf9 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -12,6 +12,14 @@ public interface Segments { Stream ranges(); + /** + * Returns whether offset {@code i} is a segmentation boundary. Throws an exception when + * {@code i} is not a valid boundary position for the source sequence. + * @param i + * @return + */ + boolean isBoundary(int i); + Stream rangesAfterIndex(int i); Stream rangesBeforeIndex(int i); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index e88e9aea6c20..7ee6f66ec914 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -12,13 +12,19 @@ public class SegmentsImplUtils { + public static boolean isBoundary(BreakIterator breakIter, CharSequence source, int i) { + breakIter.setText(source); + + return breakIter.isBoundary(i); + } + public static Stream subSequences(BreakIterator breakIter, CharSequence sourceSequence) { return ranges(breakIter, sourceSequence).map(rangeToSequenceFn(sourceSequence)); } public static Stream ranges(BreakIterator breakIter, CharSequence sourceSequence) { return rangesAfterIndex(breakIter, sourceSequence, -1); - }; + } public static Stream rangesAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { breakIter.setText(sourceSequence); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 31eca89fc2ca..100eccf57265 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -90,6 +90,38 @@ public void testMultipleSegmentObjectsFromSegmenter() { assertThat(act2, is(exp2)); } + @Test + public void testIsBoundary() { + Segmenter enWordSegmenter = + new LocalizedSegmenterBuilder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + Object[][] casesData = { + {"start of segment", 4, true}, + {"between start and limit of segment", 6, false}, + {"limit of segment", 9, true}, + {"beginning of string", 0, true}, + {"end of string", source1.length(), true}, + }; + + for (Object[] caseDatum : casesData) { + String desc = (String) caseDatum[0]; + int idx = (int) caseDatum[1]; + boolean exp = (boolean) caseDatum[2]; + + assertThat(desc, segments1.isBoundary(idx) == exp); + } + + + } + @Test public void testRangesAfterIndex() { Segmenter enWordSegmenter = From 9fbcc8a05577a53139993fd332b47f33461594d7 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 08:07:40 -0800 Subject: [PATCH 34/43] ICU-22789 Rename boundariesAfter API for Segments interface --- .../com/ibm/icu/text/segmenter/LocalizedSegmenter.java | 4 ++-- .../com/ibm/icu/text/segmenter/RuleBasedSegmenter.java | 4 ++-- .../src/main/java/com/ibm/icu/text/segmenter/Segments.java | 3 +-- .../java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java | 4 ++-- .../com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java | 7 ++++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index f35f283a75ea..095227777d91 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -115,8 +115,8 @@ public IntStream boundaries() { } @Override - public IntStream boundariesAfterIndex(int i) { - return SegmentsImplUtils.boundariesAfterIndex(this.breakIter, this.source, i); + public IntStream boundariesAfter(int i) { + return SegmentsImplUtils.boundariesAfter(this.breakIter, this.source, i); } @Override diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 71f44696d207..a8526deed4f7 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -88,8 +88,8 @@ public IntStream boundaries() { } @Override - public IntStream boundariesAfterIndex(int i) { - return SegmentsImplUtils.boundariesAfterIndex(this.breakIter, this.source, i); + public IntStream boundariesAfter(int i) { + return SegmentsImplUtils.boundariesAfter(this.breakIter, this.source, i); } @Override diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index b05bd416bcf9..de7ebef497cd 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -5,7 +5,6 @@ import java.util.function.Function; import java.util.stream.IntStream; import java.util.stream.Stream; -import java.util.stream.StreamSupport; public interface Segments { Stream subSequences(); @@ -32,7 +31,7 @@ public interface Segments { IntStream boundaries(); - IntStream boundariesAfterIndex(int i); + IntStream boundariesAfter(int i); IntStream boundariesBeforeIndex(int i); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index 7ee6f66ec914..56dfe5ad2f1b 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -85,10 +85,10 @@ public static Function rangeToSequenceFn(CharSequence sou } public static IntStream boundaries(BreakIterator breakIter, CharSequence sourceSequence) { - return boundariesAfterIndex(breakIter, sourceSequence, -1); + return boundariesAfter(breakIter, sourceSequence, -1); } - public static IntStream boundariesAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + public static IntStream boundariesAfter(BreakIterator breakIter, CharSequence sourceSequence, int i) { breakIter.setText(sourceSequence); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 100eccf57265..c8b8d12676b5 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -296,7 +296,7 @@ public void testBoundaries() { } @Test - public void testBoundariesAfterIndex() { + public void testBoundariesAfter() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -312,6 +312,7 @@ public void testBoundariesAfterIndex() { Object[][] casesData = { {"first " + TAKE_LIMIT + " before beginning", -2, new int[]{0, 3, 4, 9, 10}}, {"first " + TAKE_LIMIT + " in the middle of the third segment", 5, new int[]{9, 10, 15, 16, 19}}, + {"first " + TAKE_LIMIT + " on the limit of the third segment", 9, new int[]{10, 15, 16, 19, 20}}, {"first " + TAKE_LIMIT + " at the end", source.length(), new int[0]}, {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[0]}, }; @@ -321,9 +322,9 @@ public void testBoundariesAfterIndex() { int startIdx = (int) caseDatum[1]; int[] exp = (int[]) caseDatum[2]; - int[] act = segments.boundariesAfterIndex(startIdx).limit(TAKE_LIMIT).toArray(); + int[] act = segments.boundariesAfter(startIdx).limit(TAKE_LIMIT).toArray(); - assertThat(act, is(exp)); + assertThat(desc, act, is(exp)); } } From 4163a6d89890d1870e8e30db4e3812273c3ea828 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 08:42:12 -0800 Subject: [PATCH 35/43] ICU-22789 Rename and adjust boundary logic for boundariesBackFrom API for Segments interface --- .../icu/text/segmenter/LocalizedSegmenter.java | 4 ++-- .../icu/text/segmenter/RuleBasedSegmenter.java | 4 ++-- .../com/ibm/icu/text/segmenter/Segments.java | 2 +- .../icu/text/segmenter/SegmentsImplUtils.java | 15 +++++++++++++-- .../dev/test/text/segmenter/SegmentsTest.java | 18 ++++++++++-------- 5 files changed, 28 insertions(+), 15 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 095227777d91..323081d372f6 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -120,8 +120,8 @@ public IntStream boundariesAfter(int i) { } @Override - public IntStream boundariesBeforeIndex(int i) { - return SegmentsImplUtils.boundariesBeforeIndex(this.breakIter, this.source, i); + public IntStream boundariesBackFrom(int i) { + return SegmentsImplUtils.boundariesBackFrom(this.breakIter, this.source, i); } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index a8526deed4f7..6a30516b70c8 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -93,8 +93,8 @@ public IntStream boundariesAfter(int i) { } @Override - public IntStream boundariesBeforeIndex(int i) { - return SegmentsImplUtils.boundariesBeforeIndex(this.breakIter, this.source, i); + public IntStream boundariesBackFrom(int i) { + return SegmentsImplUtils.boundariesBackFrom(this.breakIter, this.source, i); } } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index de7ebef497cd..58da59ef4bd6 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -33,7 +33,7 @@ public interface Segments { IntStream boundariesAfter(int i); - IntStream boundariesBeforeIndex(int i); + IntStream boundariesBackFrom(int i); // // Inner enums/classes in common for other inner classes diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index 56dfe5ad2f1b..f23a1d646635 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -10,6 +10,8 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; + +// Global TODO: make initialization of breakIterator a prerequisite public class SegmentsImplUtils { public static boolean isBoundary(BreakIterator breakIter, CharSequence source, int i) { @@ -97,11 +99,20 @@ public static IntStream boundariesAfter(BreakIterator breakIter, CharSequence so return boundariesAsIntegers.mapToInt(Integer::intValue); } - public static IntStream boundariesBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + public static IntStream boundariesBackFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) { + // TODO: make initialization of breakIterator a prerequisite breakIter.setText(sourceSequence); + int sourceLength = sourceSequence.length(); + if (i < 0) { + return IntStream.empty(); + } + + boolean isOnBoundary = i <= sourceLength && isBoundary(breakIter, sourceSequence, i); + int backFromIdx = isOnBoundary ? i + 1 : i; + // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager - BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, i); + BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, backFromIdx); Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); return boundariesAsIntegers.mapToInt(Integer::intValue); } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index c8b8d12676b5..46088fe66f00 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -329,7 +329,7 @@ public void testBoundariesAfter() { } @Test - public void testBoundariesBeforeIndex() { + public void testBoundariesBackFrom() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -343,10 +343,12 @@ public void testBoundariesBeforeIndex() { Segments segments = enWordSegmenter.segment(source); Object[][] casesData = { - {"first " + TAKE_LIMIT + " before beginning", -2, new int[0]}, - {"first " + TAKE_LIMIT + " at the beginning", 0, new int[0]}, - {"first " + TAKE_LIMIT + " in the middle of the 2nd to last", 42, new int[]{41, 40, 36, 35, 32}}, - {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[]{45, 44, 41, 40, 36}}, + {"first " + TAKE_LIMIT + " before beginning", -2, new int[0]}, + {"first " + TAKE_LIMIT + " at the beginning", 0, new int[]{0}}, + {"first " + TAKE_LIMIT + " from the start of the 2nd to last segment", 41, new int[]{41, 40, 36, 35, 32}}, + {"first " + TAKE_LIMIT + " in the middle of the 2nd to last segment", 42, new int[]{41, 40, 36, 35, 32}}, + {"first " + TAKE_LIMIT + " at the end", source.length(), new int[]{45, 44, 41, 40, 36}}, + {"first " + TAKE_LIMIT + " after the end", source.length()+1, new int[]{45, 44, 41, 40, 36}}, }; for (Object[] caseDatum : casesData) { @@ -354,11 +356,11 @@ public void testBoundariesBeforeIndex() { int startIdx = (int) caseDatum[1]; int[] exp = (int[]) caseDatum[2]; - int[] act = segments.boundariesBeforeIndex(startIdx).limit(TAKE_LIMIT).toArray(); + int[] act = segments.boundariesBackFrom(startIdx).limit(TAKE_LIMIT).toArray(); - assertThat(act, is(exp)); + assertThat(desc, act, is(exp)); - if (startIdx == -2) { + if (startIdx < 0) { logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); } } From d9017e0408771789c6331be12612978dcde70cb1 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 11:19:57 -0800 Subject: [PATCH 36/43] ICU-22789 Fix typos, add TODOs for future optimization design --- .../com/ibm/icu/text/segmenter/Segments.java | 25 +++++++++++-------- .../icu/text/segmenter/SegmentsImplUtils.java | 2 ++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 58da59ef4bd6..6b5aba6ab4b0 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -11,14 +11,6 @@ public interface Segments { Stream ranges(); - /** - * Returns whether offset {@code i} is a segmentation boundary. Throws an exception when - * {@code i} is not a valid boundary position for the source sequence. - * @param i - * @return - */ - boolean isBoundary(int i); - Stream rangesAfterIndex(int i); Stream rangesBeforeIndex(int i); @@ -29,6 +21,14 @@ public interface Segments { Function rangeToSequenceFn(); + /** + * Returns whether offset {@code i} is a segmentation boundary. Throws an exception when + * {@code i} is not a valid boundary position for the source sequence. + * @param i + * @return + */ + boolean isBoundary(int i); + IntStream boundaries(); IntStream boundariesAfter(int i); @@ -48,16 +48,21 @@ enum IterationDirection { // Inner classes for Segment, SegmentIterable, and SegmentIterator // + // TODO: consider options in design for potential memory usage optimization: + // 1) keep simple class with public fields, but requires field per Segment to point to source + // 2) make Segment an interface (getSource, getStart, getLimit, getRuleStatus, newSegment), and + // maybe an abstract class that implements the interface, maybe with a default method impl + // for convenience for getting (allocating & returning) the subsequence class Segment { public final int start; public final int limit; public final int ruleStatus = 0; - public final CharSequence soruce; + public final CharSequence source; public Segment(int start, int limit, CharSequence source) { this.start = start; this.limit = limit; - this.soruce = source; + this.source = source; } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index f23a1d646635..0623a789524f 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -94,6 +94,7 @@ public static IntStream boundariesAfter(BreakIterator breakIter, CharSequence so breakIter.setText(sourceSequence); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + // TODO: optimize IntStream creation to avoid autoboxing BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.FORWARDS, i); Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); return boundariesAsIntegers.mapToInt(Integer::intValue); @@ -112,6 +113,7 @@ public static IntStream boundariesBackFrom(BreakIterator breakIter, CharSequence int backFromIdx = isOnBoundary ? i + 1 : i; // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager + // TODO: optimize IntStream creation to avoid autoboxing BoundaryIterable iterable = new BoundaryIterable(breakIter, IterationDirection.BACKWARDS, backFromIdx); Stream boundariesAsIntegers = StreamSupport.stream(iterable.spliterator(), false); return boundariesAsIntegers.mapToInt(Integer::intValue); From 47ffdd8fa9c15ff97c24fd318e6e66477c0a5d63 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 12:20:54 -0800 Subject: [PATCH 37/43] ICU-22789 Add segmentAt API for Segments interface --- .../text/segmenter/LocalizedSegmenter.java | 5 ++ .../text/segmenter/RuleBasedSegmenter.java | 5 ++ .../com/ibm/icu/text/segmenter/Segments.java | 2 + .../icu/text/segmenter/SegmentsImplUtils.java | 26 ++++++++++ .../dev/test/text/segmenter/SegmentsTest.java | 47 ++++++++++++++++++- 5 files changed, 84 insertions(+), 1 deletion(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 323081d372f6..9e3d34e75b1c 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -74,6 +74,11 @@ public Stream subSequences() { return SegmentsImplUtils.subSequences(this.breakIter, this.source); } + @Override + public Segment segmentAt(int i) { + return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i); + } + @Override public Stream ranges() { return SegmentsImplUtils.ranges(this.breakIter, this.source); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 6a30516b70c8..13a1846fc841 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -47,6 +47,11 @@ public Stream subSequences() { return SegmentsImplUtils.subSequences(this.breakIter, this.source); } + @Override + public Segment segmentAt(int i) { + return SegmentsImplUtils.segmentAt(this.breakIter, this.source, i); + } + @Override public Stream ranges() { return SegmentsImplUtils.ranges(this.breakIter, this.source); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 6b5aba6ab4b0..1e3dd9bacd5b 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -9,6 +9,8 @@ public interface Segments { Stream subSequences(); + Segment segmentAt(int i); + Stream ranges(); Stream rangesAfterIndex(int i); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index 0623a789524f..7b0259994018 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -24,6 +24,32 @@ public static Stream subSequences(BreakIterator breakIter, CharSeq return ranges(breakIter, sourceSequence).map(rangeToSequenceFn(sourceSequence)); } + public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequence, int i) { + // TODO: make initialization of breakIterator a prerequisite + breakIter.setText(sourceSequence); + + int start; + int limit; + + boolean isBoundary = breakIter.isBoundary(i); + + if (isBoundary) { + start = i; + limit = breakIter.next(); + } else { + // BreakIterator::isBoundary(i) will advance forwards to the next boundary if the argument + // is not a boundary. + limit = breakIter.current(); + start = breakIter.previous(); + } + + if (start != BreakIterator.DONE && limit != BreakIterator.DONE) { + return new Segment(start, limit, sourceSequence); + } else { + return null; + } + } + public static Stream ranges(BreakIterator breakIter, CharSequence sourceSequence) { return rangesAfterIndex(breakIter, sourceSequence, -1); } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 46088fe66f00..3d02520dd7bd 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -261,7 +261,7 @@ public void testRangeBeforeIndex() { Segment segment = segments.rangeBeforeIndex(startIdx); - if (startIdx == -2) { + if (startIdx < 0 ) { logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); } @@ -366,4 +366,49 @@ public void testBoundariesBackFrom() { } } + @Test + public void testSegmentAt() { + Segmenter enWordSegmenter = + new LocalizedSegmenterBuilder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(SegmentationType.WORD) + .build(); + + String source = "The quick brown fox jumped over the lazy dog."; + + // Create new Segments for source + Segments segments1 = enWordSegmenter.segment(source); + + Object[][] casesData = { + {"index before beginning", -2, null, null}, + {"index at beginning", 0, 0, 3}, + {"index in the middle of the first segment", 2, 0, 3}, + {"index in the middle of the third segment", 5, 4, 9}, + {"index at the end", source.length()-1, 44, 45}, + {"index after the end", source.length()+1, null, null}, + }; + + for (Object[] caseDatum : casesData) { + String desc = (String) caseDatum[0]; + int startIdx = (int) caseDatum[1]; + Integer expStart = (Integer) caseDatum[2]; + Integer expLimit = (Integer) caseDatum[3]; + + if (startIdx < 0 ) { + logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); + } + + if (expStart == null) { + assertThat("Out of bounds range should be null", expLimit == null); + } else { + Segment segment = segments1.segmentAt(startIdx); + + assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start); + assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit); + } + } + + + } + } From e500e421dcb1f66edb9a5b4c99057155348a5d94 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 12:49:17 -0800 Subject: [PATCH 38/43] ICU-22789 Rename and adjust logic for Stream-returning APIs --- .../text/segmenter/LocalizedSegmenter.java | 12 +++++------ .../text/segmenter/RuleBasedSegmenter.java | 12 +++++------ .../com/ibm/icu/text/segmenter/Segments.java | 20 +++++++++++-------- .../icu/text/segmenter/SegmentsImplUtils.java | 10 +++++----- .../dev/test/text/segmenter/SegmentsTest.java | 20 +++++++++---------- 5 files changed, 39 insertions(+), 35 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 9e3d34e75b1c..ceac7911f347 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -80,8 +80,8 @@ public Segment segmentAt(int i) { } @Override - public Stream ranges() { - return SegmentsImplUtils.ranges(this.breakIter, this.source); + public Stream segments() { + return SegmentsImplUtils.segments(this.breakIter, this.source); } @Override @@ -91,21 +91,21 @@ public boolean isBoundary(int i) { @Override public Stream rangesAfterIndex(int i) { - return SegmentsImplUtils.rangesAfterIndex(this.breakIter, this.source, i); + return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i); } @Override public Stream rangesBeforeIndex(int i) { - return SegmentsImplUtils.rangesBeforeIndex(this.breakIter, this.source, i); + return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i); } @Override - public Segment rangeAfterIndex(int i) { + public Segment segmentsFrom(int i) { return SegmentsImplUtils.rangeAfterIndex(this.breakIter, this.source, i); } @Override - public Segment rangeBeforeIndex(int i) { + public Segment segmentsBefore(int i) { return SegmentsImplUtils.rangeBeforeIndex(this.breakIter, this.source, i); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 13a1846fc841..a0e6577f771f 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -53,8 +53,8 @@ public Segment segmentAt(int i) { } @Override - public Stream ranges() { - return SegmentsImplUtils.ranges(this.breakIter, this.source); + public Stream segments() { + return SegmentsImplUtils.segments(this.breakIter, this.source); } @Override @@ -64,21 +64,21 @@ public boolean isBoundary(int i) { @Override public Stream rangesAfterIndex(int i) { - return SegmentsImplUtils.rangesAfterIndex(this.breakIter, this.source, i); + return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i); } @Override public Stream rangesBeforeIndex(int i) { - return SegmentsImplUtils.rangesBeforeIndex(this.breakIter, this.source, i); + return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i); } @Override - public Segment rangeAfterIndex(int i) { + public Segment segmentsFrom(int i) { return SegmentsImplUtils.rangeAfterIndex(this.breakIter, this.source, i); } @Override - public Segment rangeBeforeIndex(int i) { + public Segment segmentsBefore(int i) { return SegmentsImplUtils.rangeBeforeIndex(this.breakIter, this.source, i); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 1e3dd9bacd5b..dcd7dda17deb 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -11,15 +11,15 @@ public interface Segments { Segment segmentAt(int i); - Stream ranges(); + Stream segments(); Stream rangesAfterIndex(int i); Stream rangesBeforeIndex(int i); - Segment rangeAfterIndex(int i); + Segment segmentsFrom(int i); - Segment rangeBeforeIndex(int i); + Segment segmentsBefore(int i); Function rangeToSequenceFn(); @@ -103,14 +103,18 @@ class SegmentIterator implements Iterator { this.direction = direction; this.source = source; - if (direction == IterationDirection.FORWARDS) { - this.start = breakIter.following(startIdx); + Segment segmentAtIdx = SegmentsImplUtils.segmentAt(breakIter, source, startIdx); + + if (segmentAtIdx == null) { + this.start = BreakIterator.DONE; + } else if (direction == IterationDirection.FORWARDS) { + this.start = segmentAtIdx.start; + this.limit = breakIter.following(this.start); } else { assert direction == IterationDirection.BACKWARDS; - this.start = breakIter.preceding(startIdx); + this.start = breakIter.preceding(segmentAtIdx.start); + this.limit = getDirectionBasedNextIdx(); } - - this.limit = getDirectionBasedNextIdx(); } int getDirectionBasedNextIdx() { diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index 7b0259994018..546e0306cae2 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -21,7 +21,7 @@ public static boolean isBoundary(BreakIterator breakIter, CharSequence source, i } public static Stream subSequences(BreakIterator breakIter, CharSequence sourceSequence) { - return ranges(breakIter, sourceSequence).map(rangeToSequenceFn(sourceSequence)); + return segments(breakIter, sourceSequence).map(rangeToSequenceFn(sourceSequence)); } public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequence, int i) { @@ -50,11 +50,11 @@ public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequ } } - public static Stream ranges(BreakIterator breakIter, CharSequence sourceSequence) { - return rangesAfterIndex(breakIter, sourceSequence, -1); + public static Stream segments(BreakIterator breakIter, CharSequence sourceSequence) { + return segmentsFrom(breakIter, sourceSequence, 0); } - public static Stream rangesAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + public static Stream segmentsFrom(BreakIterator breakIter, CharSequence sourceSequence, int i) { breakIter.setText(sourceSequence); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager @@ -62,7 +62,7 @@ public static Stream rangesAfterIndex(BreakIterator breakIter, CharSequ return StreamSupport.stream(iterable.spliterator(), false); } - public static Stream rangesBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + public static Stream segmentsBefore(BreakIterator breakIter, CharSequence sourceSequence, int i) { breakIter.setText(sourceSequence); // create a Stream from a Spliterator of an Iterable so that the Stream can be lazy, not eager diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 3d02520dd7bd..259f9b53f78b 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -22,7 +22,7 @@ public class SegmentsTest extends CoreTestFmwk { @Test - public void testRanges() { + public void testSegments() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -34,7 +34,7 @@ public void testRanges() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List segments = segments1.ranges().collect(Collectors.toList()); + List segments = segments1.segments().collect(Collectors.toList()); assertEquals("first range start", 0, segments.get(0).start); assertEquals("first range limit", 3, segments.get(0).limit); @@ -123,7 +123,7 @@ public void testIsBoundary() { } @Test - public void testRangesAfterIndex() { + public void testSegmentsFrom() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -138,15 +138,15 @@ public void testRangesAfterIndex() { List segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); - assertEquals("first range start", 3, segments.get(0).start); - assertEquals("first range limit", 4, segments.get(0).limit); + assertEquals("first range start", 0, segments.get(0).start); + assertEquals("first range limit", 3, segments.get(0).limit); - assertEquals("second range start", 4, segments.get(1).start); - assertEquals("second range limit", 9, segments.get(1).limit); + assertEquals("second range start", 3, segments.get(1).start); + assertEquals("second range limit", 4, segments.get(1).limit); } @Test - public void testRangesBeforeIndex() { + public void testSegmentsBefore() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -218,7 +218,7 @@ public void testRangeAfterIndex() { Integer expStart = (Integer) caseDatum[2]; Integer expLimit = (Integer) caseDatum[3]; - Segment segment = segments.rangeAfterIndex(startIdx); + Segment segment = segments.segmentsFrom(startIdx); if (expStart == null) { assert expLimit == null; @@ -259,7 +259,7 @@ public void testRangeBeforeIndex() { Integer expStart = (Integer) caseDatum[2]; Integer expLimit = (Integer) caseDatum[3]; - Segment segment = segments.rangeBeforeIndex(startIdx); + Segment segment = segments.segmentsBefore(startIdx); if (startIdx < 0 ) { logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); From f9f4d041de677c34934247c2370d46561abc21c7 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 20:42:16 -0800 Subject: [PATCH 39/43] ICU-22789 Fix boundary condition behavior for segmentsBefore API --- .../com/ibm/icu/text/segmenter/Segments.java | 12 +++- .../dev/test/text/segmenter/SegmentsTest.java | 56 +++++++++++++++++-- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index dcd7dda17deb..f4004904db3a 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -112,7 +112,17 @@ class SegmentIterator implements Iterator { this.limit = breakIter.following(this.start); } else { assert direction == IterationDirection.BACKWARDS; - this.start = breakIter.preceding(segmentAtIdx.start); + if (breakIter.isBoundary(startIdx)) { + // Note: breakIter::isBoundary is a stateful operation. It resets the position in the + // BreakIterator, which we want to ensure that the position is where we think it is. + this.start = startIdx; + } else { + // Since we already called BreakIterator.isBoundary() which mutates the BreakIterator + // position to increment forwards when the return value is false, we should call + // BreakIterator.previous() to update the iterator position while getting the start value + // of the segment at startIdx + this.start = breakIter.previous(); + } this.limit = getDirectionBasedNextIdx(); } } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 259f9b53f78b..6f6b3da956eb 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -118,12 +118,10 @@ public void testIsBoundary() { assertThat(desc, segments1.isBoundary(idx) == exp); } - - } @Test - public void testSegmentsFrom() { + public void testSegmentsFrom_middleOfSegment() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -146,7 +144,7 @@ public void testSegmentsFrom() { } @Test - public void testSegmentsBefore() { + public void testSegmentsFrom_onBoundary() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -154,7 +152,53 @@ public void testSegmentsBefore() { .build(); String source1 = "The quick brown fox jumped over the lazy dog."; - int startIdx = 10; + int startIdx = 3; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); + + assertEquals("first range start", 3, segments.get(0).start); + assertEquals("first range limit", 4, segments.get(0).limit); + + assertEquals("second range start", 4, segments.get(1).start); + assertEquals("second range limit", 9, segments.get(1).limit); + } + + @Test + public void testSegmentsBefore_middleOfSegment() { + Segmenter enWordSegmenter = + new LocalizedSegmenterBuilder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + int startIdx = 8; + + // Create new Segments for source1 + Segments segments1 = enWordSegmenter.segment(source1); + + List segments = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); + + assertEquals("first range start", 3, segments.get(0).start); + assertEquals("first range limit", 4, segments.get(0).limit); + + assertEquals("second range start", 0, segments.get(1).start); + assertEquals("second range limit", 3, segments.get(1).limit); + } + + @Test + public void testSegmentsBefore_onBoundary() { + Segmenter enWordSegmenter = + new LocalizedSegmenterBuilder() + .setLocale(ULocale.ENGLISH) + .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) + .build(); + + String source1 = "The quick brown fox jumped over the lazy dog."; + int startIdx = 9; // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); @@ -182,7 +226,7 @@ public void testRangeToSequenceFn() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List exp1 = Arrays.asList("quick", " ", "The"); + List exp1 = Arrays.asList(" ", "quick", " ", "The"); List act1 = segments1.rangesBeforeIndex(startIdx) .map(segments1.rangeToSequenceFn()) From cf979a4f0f6de65cd5429bfd53fb1d5b0601e45a Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 3 Jan 2025 21:07:51 -0800 Subject: [PATCH 40/43] ICU-22789 Fix naming of APIs further --- .../text/segmenter/LocalizedSegmenter.java | 16 +++++++------- .../text/segmenter/RuleBasedSegmenter.java | 16 +++++++------- .../com/ibm/icu/text/segmenter/Segments.java | 11 +++++----- .../icu/text/segmenter/SegmentsImplUtils.java | 8 +++---- .../dev/test/text/segmenter/SegmentsTest.java | 22 +++++++++---------- 5 files changed, 37 insertions(+), 36 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index ceac7911f347..f2ebcf690b13 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -90,28 +90,28 @@ public boolean isBoundary(int i) { } @Override - public Stream rangesAfterIndex(int i) { + public Stream segmentsFrom(int i) { return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i); } @Override - public Stream rangesBeforeIndex(int i) { + public Stream segmentsBefore(int i) { return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i); } @Override - public Segment segmentsFrom(int i) { - return SegmentsImplUtils.rangeAfterIndex(this.breakIter, this.source, i); + public Segment segmentAfterIndex(int i) { + return SegmentsImplUtils.segmentAfterIndex(this.breakIter, this.source, i); } @Override - public Segment segmentsBefore(int i) { - return SegmentsImplUtils.rangeBeforeIndex(this.breakIter, this.source, i); + public Segment segmentBeforeIndex(int i) { + return SegmentsImplUtils.segmentBeforeIndex(this.breakIter, this.source, i); } @Override - public Function rangeToSequenceFn() { - return SegmentsImplUtils.rangeToSequenceFn(this.source); + public Function segmentToSequenceFn() { + return SegmentsImplUtils.segmentToSequenceFn(this.source); } @Override diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index a0e6577f771f..54ee59051b76 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -63,28 +63,28 @@ public boolean isBoundary(int i) { } @Override - public Stream rangesAfterIndex(int i) { + public Stream segmentsFrom(int i) { return SegmentsImplUtils.segmentsFrom(this.breakIter, this.source, i); } @Override - public Stream rangesBeforeIndex(int i) { + public Stream segmentsBefore(int i) { return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i); } @Override - public Segment segmentsFrom(int i) { - return SegmentsImplUtils.rangeAfterIndex(this.breakIter, this.source, i); + public Segment segmentAfterIndex(int i) { + return SegmentsImplUtils.segmentAfterIndex(this.breakIter, this.source, i); } @Override - public Segment segmentsBefore(int i) { - return SegmentsImplUtils.rangeBeforeIndex(this.breakIter, this.source, i); + public Segment segmentBeforeIndex(int i) { + return SegmentsImplUtils.segmentBeforeIndex(this.breakIter, this.source, i); } @Override - public Function rangeToSequenceFn() { - return SegmentsImplUtils.rangeToSequenceFn(this.source); + public Function segmentToSequenceFn() { + return SegmentsImplUtils.segmentToSequenceFn(this.source); } @Override diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index f4004904db3a..9faf1efa5950 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -7,21 +7,22 @@ import java.util.stream.Stream; public interface Segments { + Stream subSequences(); Segment segmentAt(int i); Stream segments(); - Stream rangesAfterIndex(int i); + Stream segmentsFrom(int i); - Stream rangesBeforeIndex(int i); + Stream segmentsBefore(int i); - Segment segmentsFrom(int i); + Segment segmentAfterIndex(int i); - Segment segmentsBefore(int i); + Segment segmentBeforeIndex(int i); - Function rangeToSequenceFn(); + Function segmentToSequenceFn(); /** * Returns whether offset {@code i} is a segmentation boundary. Throws an exception when diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java index 546e0306cae2..09a521c2e6d5 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/SegmentsImplUtils.java @@ -21,7 +21,7 @@ public static boolean isBoundary(BreakIterator breakIter, CharSequence source, i } public static Stream subSequences(BreakIterator breakIter, CharSequence sourceSequence) { - return segments(breakIter, sourceSequence).map(rangeToSequenceFn(sourceSequence)); + return segments(breakIter, sourceSequence).map(segmentToSequenceFn(sourceSequence)); } public static Segment segmentAt(BreakIterator breakIter, CharSequence sourceSequence, int i) { @@ -70,7 +70,7 @@ public static Stream segmentsBefore(BreakIterator breakIter, CharSequen return StreamSupport.stream(iterable.spliterator(), false); } - public static Segment rangeAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + public static Segment segmentAfterIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { breakIter.setText(sourceSequence); int start = breakIter.following(i); @@ -86,7 +86,7 @@ public static Segment rangeAfterIndex(BreakIterator breakIter, CharSequence sour return new Segment(start, limit, sourceSequence); } - public static Segment rangeBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { + public static Segment segmentBeforeIndex(BreakIterator breakIter, CharSequence sourceSequence, int i) { breakIter.setText(sourceSequence); @@ -108,7 +108,7 @@ public static Segment rangeBeforeIndex(BreakIterator breakIter, CharSequence sou return new Segment(limit, start, sourceSequence); } - public static Function rangeToSequenceFn(CharSequence sourceSequence) { + public static Function segmentToSequenceFn(CharSequence sourceSequence) { return segment -> sourceSequence.subSequence(segment.start, segment.limit); } diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 6f6b3da956eb..2eae72a577ec 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -134,7 +134,7 @@ public void testSegmentsFrom_middleOfSegment() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); + List segments = segments1.segmentsFrom(startIdx).collect(Collectors.toList()); assertEquals("first range start", 0, segments.get(0).start); assertEquals("first range limit", 3, segments.get(0).limit); @@ -157,7 +157,7 @@ public void testSegmentsFrom_onBoundary() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List segments = segments1.rangesAfterIndex(startIdx).collect(Collectors.toList()); + List segments = segments1.segmentsFrom(startIdx).collect(Collectors.toList()); assertEquals("first range start", 3, segments.get(0).start); assertEquals("first range limit", 4, segments.get(0).limit); @@ -180,7 +180,7 @@ public void testSegmentsBefore_middleOfSegment() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List segments = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); + List segments = segments1.segmentsBefore(startIdx).collect(Collectors.toList()); assertEquals("first range start", 3, segments.get(0).start); assertEquals("first range limit", 4, segments.get(0).limit); @@ -203,7 +203,7 @@ public void testSegmentsBefore_onBoundary() { // Create new Segments for source1 Segments segments1 = enWordSegmenter.segment(source1); - List segments = segments1.rangesBeforeIndex(startIdx).collect(Collectors.toList()); + List segments = segments1.segmentsBefore(startIdx).collect(Collectors.toList()); assertEquals("first range start", 4, segments.get(0).start); assertEquals("first range limit", 9, segments.get(0).limit); @@ -213,7 +213,7 @@ public void testSegmentsBefore_onBoundary() { } @Test - public void testRangeToSequenceFn() { + public void testSegmentToSequenceFn() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -228,15 +228,15 @@ public void testRangeToSequenceFn() { List exp1 = Arrays.asList(" ", "quick", " ", "The"); - List act1 = segments1.rangesBeforeIndex(startIdx) - .map(segments1.rangeToSequenceFn()) + List act1 = segments1.segmentsBefore(startIdx) + .map(segments1.segmentToSequenceFn()) .collect(Collectors.toList()); assertThat(act1, is(exp1)); } @Test - public void testRangeAfterIndex() { + public void testSegmentAfterIndex() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -262,7 +262,7 @@ public void testRangeAfterIndex() { Integer expStart = (Integer) caseDatum[2]; Integer expLimit = (Integer) caseDatum[3]; - Segment segment = segments.segmentsFrom(startIdx); + Segment segment = segments.segmentAfterIndex(startIdx); if (expStart == null) { assert expLimit == null; @@ -276,7 +276,7 @@ public void testRangeAfterIndex() { @Test - public void testRangeBeforeIndex() { + public void testSegmentBeforeIndex() { Segmenter enWordSegmenter = new LocalizedSegmenterBuilder() .setLocale(ULocale.ENGLISH) @@ -303,7 +303,7 @@ public void testRangeBeforeIndex() { Integer expStart = (Integer) caseDatum[2]; Integer expLimit = (Integer) caseDatum[3]; - Segment segment = segments.segmentsBefore(startIdx); + Segment segment = segments.segmentBeforeIndex(startIdx); if (startIdx < 0 ) { logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); From f59006802f9321f6e09dd0ebb4787092c02332c3 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Thu, 9 Jan 2025 14:04:34 -0800 Subject: [PATCH 41/43] ICU-22789 Fix localized segmenter test by not expecting locale tailorings --- .../dev/test/text/segmenter/LocalizedSegmenterTest.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java index 6ba380eabdf1..139e9247f547 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java @@ -22,11 +22,10 @@ public class LocalizedSegmenterTest extends CoreTestFmwk { @Test public void testLocaleInLocalizedSegmenter() { - String source = "k:a"; + String source = "Die 21en Jahrh. ist die Beste."; Object[][] casesData = { - {"en", Arrays.asList("k", ":", "a")}, - {"sv", Arrays.asList("k:a")} + {"de", Arrays.asList("Die 21en Jahrh. ist die Beste.")}, }; for (Object[] caseDatum : casesData) { @@ -37,7 +36,7 @@ public void testLocaleInLocalizedSegmenter() { Segmenter wordSeg = new LocalizedSegmenterBuilder() .setLocale(locale) - .setSegmentationType(SegmentationType.WORD) + .setSegmentationType(SegmentationType.SENTENCE) .build(); Segments segments = wordSeg.segment(source); From 036eb6e45b8651f9b8197dac8996269678277e8d Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Thu, 9 Jan 2025 14:09:38 -0800 Subject: [PATCH 42/43] Revert "ICU-22789 Create top level classes for builders of concrete Segmenter types" Also fix additional new usages of `LocalizedSegmenterBuilder` accordingly This reverts commit f12d724f505fed4dd96e4fd360d02e7b9afe5555. --- .../text/segmenter/LocalizedSegmenter.java | 28 ++++++++++++++++++ .../segmenter/LocalizedSegmenterBuilder.java | 27 ----------------- .../text/segmenter/RuleBasedSegmenter.java | 20 +++++++++++++ .../segmenter/RuleBasedSegmenterBuilder.java | 17 ----------- .../segmenter/LocalizedSegmenterTest.java | 3 +- .../segmenter/RuleBasedSegmenterTest.java | 3 +- .../dev/test/text/segmenter/SegmentsTest.java | 29 +++++++++---------- 7 files changed, 64 insertions(+), 63 deletions(-) delete mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java delete mode 100644 icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index f2ebcf690b13..098a648f7a0f 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -17,6 +17,10 @@ public Segments segment(CharSequence s) { return new LocalizedSegments(s, this); } + public static Builder builder() { + return new Builder(); + } + LocalizedSegmenter(ULocale locale, SegmentationType segmentationType) { this.locale = locale; this.segmentationType = segmentationType; @@ -55,6 +59,30 @@ public enum SegmentationType { SENTENCE, } + public static class Builder { + + private ULocale locale = ULocale.ROOT; + + private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER; + + Builder() { } + + public Builder setLocale(ULocale locale) { + this.locale = locale; + return this; + } + + public Builder setSegmentationType(SegmentationType segmentationType) { + this.segmentationType = segmentationType; + return this; + } + + public LocalizedSegmenter build() { + return new LocalizedSegmenter(this.locale, this.segmentationType); + } + + } + public class LocalizedSegments implements Segments { private CharSequence source; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java deleted file mode 100644 index ffd3138a19de..000000000000 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenterBuilder.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.ibm.icu.text.segmenter; - -import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; -import com.ibm.icu.util.ULocale; - -public class LocalizedSegmenterBuilder { - - private ULocale locale = ULocale.ROOT; - - private SegmentationType segmentationType = SegmentationType.GRAPHEME_CLUSTER; - - public LocalizedSegmenterBuilder() { } - - public LocalizedSegmenterBuilder setLocale(ULocale locale) { - this.locale = locale; - return this; - } - - public LocalizedSegmenterBuilder setSegmentationType(SegmentationType segmentationType) { - this.segmentationType = segmentationType; - return this; - } - - public LocalizedSegmenter build() { - return new LocalizedSegmenter(this.locale, this.segmentationType); - } -} diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index 54ee59051b76..cd181e5360f2 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -15,6 +15,10 @@ public Segments segment(CharSequence s) { return new RuleBasedSegments(s, this); } + public static Builder builder() { + return new Builder(); + } + RuleBasedSegmenter(String rules) { this.rules = rules; } @@ -29,6 +33,22 @@ public RuleBasedBreakIterator getNewBreakIterator() { return new RuleBasedBreakIterator(this.rules); } + public static class Builder { + + String rules; + + Builder() { } + + public Builder setRules(String rules) { + this.rules = rules; + return this; + } + + public RuleBasedSegmenter build() { + return new RuleBasedSegmenter(this.rules); + } + } + public static class RuleBasedSegments implements Segments { private CharSequence source; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java deleted file mode 100644 index 774f212ddca5..000000000000 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenterBuilder.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.ibm.icu.text.segmenter; - -public class RuleBasedSegmenterBuilder { - - String rules; - - public RuleBasedSegmenterBuilder() { } - - public RuleBasedSegmenterBuilder setRules(String rules) { - this.rules = rules; - return this; - } - - public RuleBasedSegmenter build() { - return new RuleBasedSegmenter(this.rules); - } -} diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java index 139e9247f547..810b388d3482 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/LocalizedSegmenterTest.java @@ -6,7 +6,6 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.LocalizedSegmenter; import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; -import com.ibm.icu.text.segmenter.LocalizedSegmenterBuilder; import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.util.ULocale; @@ -34,7 +33,7 @@ public void testLocaleInLocalizedSegmenter() { List expWords = (List) caseDatum[1]; Segmenter wordSeg = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(locale) .setSegmentationType(SegmentationType.SENTENCE) .build(); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java index 6d08e1314702..5e46fe608038 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/RuleBasedSegmenterTest.java @@ -5,7 +5,6 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.RuleBasedSegmenter; -import com.ibm.icu.text.segmenter.RuleBasedSegmenterBuilder; import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import java.util.Arrays; @@ -35,7 +34,7 @@ public void testRules() { // the following rule substring was taken as a subset from BreakIteratorRules_en_US_TEST.java: String rules = subrule; - Segmenter seg = new RuleBasedSegmenterBuilder() + Segmenter seg = RuleBasedSegmenter.builder() .setRules(rules) .build(); Segments segments = seg.segment(source); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index 2eae72a577ec..ce77c6eb125d 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -6,7 +6,6 @@ import com.ibm.icu.dev.test.CoreTestFmwk; import com.ibm.icu.text.segmenter.LocalizedSegmenter; import com.ibm.icu.text.segmenter.LocalizedSegmenter.SegmentationType; -import com.ibm.icu.text.segmenter.LocalizedSegmenterBuilder; import com.ibm.icu.text.segmenter.Segmenter; import com.ibm.icu.text.segmenter.Segments; import com.ibm.icu.text.segmenter.Segments.Segment; @@ -24,7 +23,7 @@ public class SegmentsTest extends CoreTestFmwk { @Test public void testSegments() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -46,7 +45,7 @@ public void testSegments() { @Test public void testMultipleSegmentObjectsFromSegmenter() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -93,7 +92,7 @@ public void testMultipleSegmentObjectsFromSegmenter() { @Test public void testIsBoundary() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -123,7 +122,7 @@ public void testIsBoundary() { @Test public void testSegmentsFrom_middleOfSegment() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -146,7 +145,7 @@ public void testSegmentsFrom_middleOfSegment() { @Test public void testSegmentsFrom_onBoundary() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -169,7 +168,7 @@ public void testSegmentsFrom_onBoundary() { @Test public void testSegmentsBefore_middleOfSegment() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -192,7 +191,7 @@ public void testSegmentsBefore_middleOfSegment() { @Test public void testSegmentsBefore_onBoundary() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -215,7 +214,7 @@ public void testSegmentsBefore_onBoundary() { @Test public void testSegmentToSequenceFn() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -238,7 +237,7 @@ public void testSegmentToSequenceFn() { @Test public void testSegmentAfterIndex() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) .build(); @@ -278,7 +277,7 @@ public void testSegmentAfterIndex() { @Test public void testSegmentBeforeIndex() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -322,7 +321,7 @@ public void testSegmentBeforeIndex() { @Test public void testBoundaries() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -342,7 +341,7 @@ public void testBoundaries() { @Test public void testBoundariesAfter() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -375,7 +374,7 @@ public void testBoundariesAfter() { @Test public void testBoundariesBackFrom() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); @@ -413,7 +412,7 @@ public void testBoundariesBackFrom() { @Test public void testSegmentAt() { Segmenter enWordSegmenter = - new LocalizedSegmenterBuilder() + LocalizedSegmenter.builder() .setLocale(ULocale.ENGLISH) .setSegmentationType(SegmentationType.WORD) .build(); From 124c487d804fb1610821f8402fdf3bda4a9ec2ce Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Thu, 9 Jan 2025 14:14:12 -0800 Subject: [PATCH 43/43] ICU-22789 Remove `segmentAfterIndex` and `segmentBeforeIndex` --- .../text/segmenter/LocalizedSegmenter.java | 10 --- .../text/segmenter/RuleBasedSegmenter.java | 10 --- .../com/ibm/icu/text/segmenter/Segments.java | 4 - .../dev/test/text/segmenter/SegmentsTest.java | 84 ------------------- 4 files changed, 108 deletions(-) diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java index 098a648f7a0f..0e297c483dd9 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/LocalizedSegmenter.java @@ -127,16 +127,6 @@ public Stream segmentsBefore(int i) { return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i); } - @Override - public Segment segmentAfterIndex(int i) { - return SegmentsImplUtils.segmentAfterIndex(this.breakIter, this.source, i); - } - - @Override - public Segment segmentBeforeIndex(int i) { - return SegmentsImplUtils.segmentBeforeIndex(this.breakIter, this.source, i); - } - @Override public Function segmentToSequenceFn() { return SegmentsImplUtils.segmentToSequenceFn(this.source); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java index cd181e5360f2..18f32ae78fd9 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/RuleBasedSegmenter.java @@ -92,16 +92,6 @@ public Stream segmentsBefore(int i) { return SegmentsImplUtils.segmentsBefore(this.breakIter, this.source, i); } - @Override - public Segment segmentAfterIndex(int i) { - return SegmentsImplUtils.segmentAfterIndex(this.breakIter, this.source, i); - } - - @Override - public Segment segmentBeforeIndex(int i) { - return SegmentsImplUtils.segmentBeforeIndex(this.breakIter, this.source, i); - } - @Override public Function segmentToSequenceFn() { return SegmentsImplUtils.segmentToSequenceFn(this.source); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java index 9faf1efa5950..8840b1c96fce 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/segmenter/Segments.java @@ -18,10 +18,6 @@ public interface Segments { Stream segmentsBefore(int i); - Segment segmentAfterIndex(int i); - - Segment segmentBeforeIndex(int i); - Function segmentToSequenceFn(); /** diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java index ce77c6eb125d..f22be7231f8d 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/text/segmenter/SegmentsTest.java @@ -234,90 +234,6 @@ public void testSegmentToSequenceFn() { assertThat(act1, is(exp1)); } - @Test - public void testSegmentAfterIndex() { - Segmenter enWordSegmenter = - LocalizedSegmenter.builder() - .setLocale(ULocale.ENGLISH) - .setSegmentationType(LocalizedSegmenter.SegmentationType.WORD) - .build(); - - String source = "The quick brown fox jumped over the lazy dog."; - - // Create new Segments for source - Segments segments = enWordSegmenter.segment(source); - - Object[][] casesData = { - {"index before beginning", -2, 0, 3}, - {"index at beginning", 0, 3, 4}, - {"index in the middle (end of first segment)", 3, 4, 9}, - {"index at the end", source.length()-1, null, null}, - {"index after the end", source.length()+1, null, null}, - }; - - for (Object[] caseDatum : casesData) { - String desc = (String) caseDatum[0]; - int startIdx = (int) caseDatum[1]; - Integer expStart = (Integer) caseDatum[2]; - Integer expLimit = (Integer) caseDatum[3]; - - Segment segment = segments.segmentAfterIndex(startIdx); - - if (expStart == null) { - assert expLimit == null; - assertThat("Out of bounds range should be null", segment == null); - } else { - assertEquals(desc + ", start", (long) expStart.intValue(), segment.start); - assertEquals(desc + ", limit", (long) expLimit.intValue(), segment.limit); - } - } - } - - - @Test - public void testSegmentBeforeIndex() { - Segmenter enWordSegmenter = - LocalizedSegmenter.builder() - .setLocale(ULocale.ENGLISH) - .setSegmentationType(SegmentationType.WORD) - .build(); - - String source = "The quick brown fox jumped over the lazy dog."; - - // Create new Segments for source - Segments segments = enWordSegmenter.segment(source); - - Object[][] casesData = { - {"index before beginning", -2, null, null}, - {"index at beginning", 0, null, null}, - {"index in the middle of the first segment", 2, null, null}, - {"index in the middle of the third segment", 5, 3, 4}, - {"index at the end", source.length()-1, 40, 41}, - {"index after the end", source.length()+1, source.length()-1, source.length()}, - }; - - for (Object[] caseDatum : casesData) { - String desc = (String) caseDatum[0]; - int startIdx = (int) caseDatum[1]; - Integer expStart = (Integer) caseDatum[2]; - Integer expLimit = (Integer) caseDatum[3]; - - Segment segment = segments.segmentBeforeIndex(startIdx); - - if (startIdx < 0 ) { - logKnownIssue("ICU-22987", "BreakIterator.preceding(-2) should return DONE, not 0"); - } - - if (expStart == null) { - assert expLimit == null; - assertThat("Out of bounds range should be null", segment == null); - } else { - assertEquals(desc + ", start", (long) expStart.intValue(), (long) segment.start); - assertEquals(desc + ", limit", (long) expLimit.intValue(), (long) segment.limit); - } - } - } - @Test public void testBoundaries() { Segmenter enWordSegmenter =