diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index b57f172428ac..7a8d3e712bcc 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -703,11 +703,29 @@ public static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { if (pattern.numBytes() == 0) return target.indexOfEmpty(start); if (target.numBytes() == 0) return MATCH_NOT_FOUND; - - StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); - stringSearch.setIndex(start); - - return stringSearch.next(); + // Initialize the string search with respect to the specified ICU collation. + String targetStr = target.toValidString(); + String patternStr = pattern.toValidString(); + // `start` counts code points; `String.offsetByCodePoints` throws IndexOutOfBoundsException + // when the offset exceeds the string, so report no match for an out-of-range `start`. + if (targetStr.codePointCount(0, targetStr.length()) <= start) return MATCH_NOT_FOUND; + StringSearch stringSearch = + CollationFactory.getStringSearch(targetStr, patternStr, collationId); + stringSearch.setOverlapping(true); + // Start the search from `start`-th code point (NOT necessarily from the `start`-th character). + int startIndex = targetStr.offsetByCodePoints(0, start); + stringSearch.setIndex(startIndex); + // Perform the search and return the next result, starting from the specified position. + int searchIndex = stringSearch.next(); + if (searchIndex == StringSearch.DONE) { + return MATCH_NOT_FOUND; + } + // Convert the search index from character count to code point count. 
+ int indexOf = targetStr.codePointCount(0, searchIndex); + if (indexOf < start) { + return MATCH_NOT_FOUND; + } + return indexOf; } private static int find(UTF8String target, UTF8String pattern, int start, diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 4301bf56b6d5..bc14a019a463 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -858,8 +858,12 @@ public void testInitCap() throws SparkException { "Ss Fi Ffi Ff St Σημερινος Ασημενιος İota"); } - private void assertStringInstr(String string, String substring, String collationName, - Integer expected) throws SparkException { + /** + * Verify the behaviour of the `StringInstr` collation support class. + */ + + private void assertStringInstr(String string, String substring, + String collationName, int expected) throws SparkException { UTF8String str = UTF8String.fromString(string); UTF8String substr = UTF8String.fromString(substring); int collationId = CollationFactory.collationNameToId(collationName); @@ -868,56 +872,85 @@ private void assertStringInstr(String string, String substring, String collation @Test public void testStringInstr() throws SparkException { - assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0); - assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0); + // Empty strings. 
+ assertStringInstr("", "", "UTF8_BINARY", 1); + assertStringInstr("", "", "UTF8_LCASE", 1); + assertStringInstr("", "", "UNICODE_CI", 1); + assertStringInstr("", "", "UNICODE", 1); + assertStringInstr("a", "", "UTF8_BINARY", 1); + assertStringInstr("a", "", "UTF8_LCASE", 1); + assertStringInstr("a", "", "UNICODE", 1); + assertStringInstr("a", "", "UNICODE_CI", 1); + assertStringInstr("", "x", "UTF8_BINARY", 0); + assertStringInstr("", "x", "UTF8_LCASE", 0); + assertStringInstr("", "x", "UNICODE", 0); + assertStringInstr("", "x", "UNICODE_CI", 0); + // Basic tests. + assertStringInstr("aaads", "aa", "UTF8_BINARY", 1); + assertStringInstr("aaads", "aa", "UTF8_LCASE", 1); + assertStringInstr("aaads", "aa", "UNICODE", 1); + assertStringInstr("aaads", "aa", "UNICODE_CI", 1); assertStringInstr("aaads", "ds", "UTF8_BINARY", 4); - assertStringInstr("xxxx", "", "UTF8_BINARY", 1); - assertStringInstr("", "xxxx", "UTF8_BINARY", 0); - assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5); - assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8); + assertStringInstr("aaads", "ds", "UTF8_LCASE", 4); + assertStringInstr("aaads", "ds", "UNICODE", 4); + assertStringInstr("aaads", "ds", "UNICODE_CI", 4); + assertStringInstr("aaads", "Aa", "UTF8_BINARY", 0); assertStringInstr("aaads", "Aa", "UTF8_LCASE", 1); + assertStringInstr("aaads", "Aa", "UNICODE", 0); + assertStringInstr("aaads", "Aa", "UNICODE_CI", 1); + assertStringInstr("aaaDs", "de", "UTF8_BINARY", 0); assertStringInstr("aaaDs", "de", "UTF8_LCASE", 0); + assertStringInstr("aaaDs", "de", "UNICODE", 0); + assertStringInstr("aaaDs", "de", "UNICODE_CI", 0); + assertStringInstr("aaaDs", "ds", "UTF8_BINARY", 0); assertStringInstr("aaaDs", "ds", "UTF8_LCASE", 4); - assertStringInstr("xxxx", "", "UTF8_LCASE", 1); - assertStringInstr("", "xxxx", "UTF8_LCASE", 0); + assertStringInstr("aaaDs", "ds", "UNICODE", 0); + assertStringInstr("aaaDs", "ds", "UNICODE_CI", 4); + assertStringInstr("aaadS", "Ds", "UTF8_BINARY", 0); + 
assertStringInstr("aaadS", "Ds", "UTF8_LCASE", 4); + assertStringInstr("aaadS", "Ds", "UNICODE", 0); + assertStringInstr("aaadS", "Ds", "UNICODE_CI", 4); + // Advanced tests. + assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5); assertStringInstr("test大千世界X大千世界", "大千", "UTF8_LCASE", 5); + assertStringInstr("test大千世界X大千世界", "大千", "UNICODE", 5); + assertStringInstr("test大千世界X大千世界", "大千", "UNICODE_CI", 5); + assertStringInstr("test大千世界X大千世界", "界X", "UTF8_BINARY", 8); + assertStringInstr("test大千世界X大千世界", "界X", "UTF8_LCASE", 8); + assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); + assertStringInstr("test大千世界X大千世界", "界X", "UNICODE_CI", 8); + assertStringInstr("test大千世界X大千世界", "界x", "UTF8_BINARY", 0); assertStringInstr("test大千世界X大千世界", "界x", "UTF8_LCASE", 8); - assertStringInstr("aaads", "Aa", "UNICODE", 0); - assertStringInstr("aaads", "aa", "UNICODE", 1); - assertStringInstr("aaads", "de", "UNICODE", 0); - assertStringInstr("xxxx", "", "UNICODE", 1); - assertStringInstr("", "xxxx", "UNICODE", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE", 0); - assertStringInstr("test大千世界X大千世界", "界X", "UNICODE", 8); - assertStringInstr("xxxx", "", "UNICODE_CI", 1); - assertStringInstr("", "xxxx", "UNICODE_CI", 0); - assertStringInstr("aaads", "AD", "UNICODE_CI", 3); - assertStringInstr("aaads", "dS", "UNICODE_CI", 4); - assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); - assertStringInstr("i̇", "i", "UNICODE_CI", 0); - assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0); - assertStringInstr("i̇", "İ", "UNICODE_CI", 1); + assertStringInstr("test大千世界X大千世界", "界y", "UTF8_BINARY", 0); + assertStringInstr("test大千世界X大千世界", "界y", "UTF8_LCASE", 0); + assertStringInstr("test大千世界X大千世界", "界y", "UNICODE", 0); + assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); + // One-to-many case mapping (e.g. Turkish dotted I). 
+ assertStringInstr("i\u0307", "i", "UNICODE_CI", 0); + assertStringInstr("i\u0307", "\u0307", "UNICODE_CI", 0); + assertStringInstr("i\u0307", "İ", "UNICODE_CI", 1); assertStringInstr("İ", "i", "UNICODE_CI", 0); - assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1); + assertStringInstr("İoi̇o12", "i\u0307o", "UNICODE_CI", 1); assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); - assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3); + assertStringInstr("abİoi̇o", "i\u0307o", "UNICODE_CI", 3); assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); - assertStringInstr("i̇", "i", "UTF8_LCASE", 1); // != UNICODE_CI - assertStringInstr("i̇", "\u0307", "UTF8_LCASE", 2); // != UNICODE_CI - assertStringInstr("i̇", "İ", "UTF8_LCASE", 1); + assertStringInstr("i\u0307", "i", "UTF8_LCASE", 1); // != UNICODE_CI + assertStringInstr("i\u0307", "\u0307", "UTF8_LCASE", 2); // != UNICODE_CI + assertStringInstr("i\u0307", "İ", "UTF8_LCASE", 1); assertStringInstr("İ", "i", "UTF8_LCASE", 0); - assertStringInstr("İoi̇o12", "i̇o", "UTF8_LCASE", 1); + assertStringInstr("İoi̇o12", "i\u0307o", "UTF8_LCASE", 1); assertStringInstr("i̇oİo12", "İo", "UTF8_LCASE", 1); - assertStringInstr("abİoi̇o", "i̇o", "UTF8_LCASE", 3); + assertStringInstr("abİoi̇o", "i\u0307o", "UTF8_LCASE", 3); assertStringInstr("abi̇oİo", "İo", "UTF8_LCASE", 3); assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_LCASE", 3); assertStringInstr("ai̇oxXİo", "Xx", "UTF8_LCASE", 5); assertStringInstr("abİoi̇o", "\u0307o", "UTF8_LCASE", 6); assertStringInstr("aİoi̇oxx", "XX", "UTF8_LCASE", 7); - // Greek sigmas. + // Conditional case mapping (e.g. Greek sigmas). 
assertStringInstr("σ", "σ", "UTF8_BINARY", 1); assertStringInstr("σ", "ς", "UTF8_BINARY", 0); assertStringInstr("σ", "Σ", "UTF8_BINARY", 0); @@ -954,6 +987,31 @@ public void testStringInstr() throws SparkException { assertStringInstr("Σ", "σ", "UNICODE_CI", 1); assertStringInstr("Σ", "ς", "UNICODE_CI", 1); assertStringInstr("Σ", "Σ", "UNICODE_CI", 1); + // Surrogate pairs. + assertStringInstr("a🙃b", "a", "UTF8_BINARY", 1); + assertStringInstr("a🙃b", "a", "UTF8_LCASE", 1); + assertStringInstr("a🙃b", "a", "UNICODE", 1); + assertStringInstr("a🙃b", "a", "UNICODE_CI", 1); + assertStringInstr("a🙃b", "🙃", "UTF8_BINARY", 2); + assertStringInstr("a🙃b", "🙃", "UTF8_LCASE", 2); + assertStringInstr("a🙃b", "🙃", "UNICODE", 2); + assertStringInstr("a🙃b", "🙃", "UNICODE_CI", 2); + assertStringInstr("a🙃b", "b", "UTF8_BINARY", 3); + assertStringInstr("a🙃b", "b", "UTF8_LCASE", 3); + assertStringInstr("a🙃b", "b", "UNICODE", 3); + assertStringInstr("a🙃b", "b", "UNICODE_CI", 3); + assertStringInstr("a🙃🙃b", "🙃", "UTF8_BINARY", 2); + assertStringInstr("a🙃🙃b", "🙃", "UTF8_LCASE", 2); + assertStringInstr("a🙃🙃b", "🙃", "UNICODE", 2); + assertStringInstr("a🙃🙃b", "🙃", "UNICODE_CI", 2); + assertStringInstr("a🙃🙃b", "b", "UTF8_BINARY", 4); + assertStringInstr("a🙃🙃b", "b", "UTF8_LCASE", 4); + assertStringInstr("a🙃🙃b", "b", "UNICODE", 4); + assertStringInstr("a🙃🙃b", "b", "UNICODE_CI", 4); + assertStringInstr("a🙃x🙃b", "b", "UTF8_BINARY", 5); + assertStringInstr("a🙃x🙃b", "b", "UTF8_LCASE", 5); + assertStringInstr("a🙃x🙃b", "b", "UNICODE", 5); + assertStringInstr("a🙃x🙃b", "b", "UNICODE_CI", 5); } private void assertFindInSet(String word, UTF8String set, String collationName, @@ -1185,118 +1243,288 @@ public void testReplace() throws SparkException { } - private void assertLocate(String substring, String string, Integer start, String collationName, - Integer expected) throws SparkException { + /** + * Verify the behaviour of the `StringLocate` collation support class. 
+ */ + + private void assertStringLocate(String substring, String string, int start, + String collationName, int expected) throws SparkException { + // Note: When using start < 1, be careful to understand the behavior of the `indexOf` + // method and the implications of using `indexOf` in the `StringLocate` case class. UTF8String substr = UTF8String.fromString(substring); UTF8String str = UTF8String.fromString(string); int collationId = CollationFactory.collationNameToId(collationName); - assertEquals(expected, CollationSupport.StringLocate.exec(str, substr, - start - 1, collationId) + 1); + int result = CollationSupport.StringLocate.exec(str, substr, start - 1, collationId) + 1; + assertEquals(expected, result); } @Test - public void testLocate() throws SparkException { - // If you add tests with start < 1 be careful to understand the behavior of the indexOf method - // and usage of indexOf in the StringLocate class. - assertLocate("aa", "aaads", 1, "UTF8_BINARY", 1); - assertLocate("aa", "aaads", 2, "UTF8_BINARY", 2); - assertLocate("aa", "aaads", 3, "UTF8_BINARY", 0); - assertLocate("Aa", "aaads", 1, "UTF8_BINARY", 0); - assertLocate("Aa", "aAads", 1, "UTF8_BINARY", 2); - assertLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0); - assertLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8); - assertLocate("界", "test大千世界X大千世界", 13, "UTF8_BINARY", 13); - assertLocate("AA", "aaads", 1, "UTF8_LCASE", 1); - assertLocate("aa", "aAads", 2, "UTF8_LCASE", 2); - assertLocate("aa", "aaAds", 3, "UTF8_LCASE", 0); - assertLocate("abC", "abcabc", 1, "UTF8_LCASE", 1); - assertLocate("abC", "abCabc", 2, "UTF8_LCASE", 4); - assertLocate("abc", "abcabc", 4, "UTF8_LCASE", 4); - assertLocate("界x", "test大千世界X大千世界", 1, "UTF8_LCASE", 8); - assertLocate("界X", "test大千世界Xtest大千世界", 1, "UTF8_LCASE", 8); - assertLocate("界", "test大千世界X大千世界", 13, "UTF8_LCASE", 13); - assertLocate("大千", "test大千世界大千世界", 1, "UTF8_LCASE", 5); - assertLocate("大千", "test大千世界大千世界", 9, "UTF8_LCASE", 9); - 
assertLocate("大千", "大千世界大千世界", 1, "UTF8_LCASE", 1); - assertLocate("aa", "Aaads", 1, "UNICODE", 2); - assertLocate("AA", "aaads", 1, "UNICODE", 0); - assertLocate("aa", "aAads", 2, "UNICODE", 0); - assertLocate("aa", "aaAds", 3, "UNICODE", 0); - assertLocate("abC", "abcabc", 1, "UNICODE", 0); - assertLocate("abC", "abCabc", 2, "UNICODE", 0); - assertLocate("abC", "abCabC", 2, "UNICODE", 4); - assertLocate("abc", "abcabc", 1, "UNICODE", 1); - assertLocate("abc", "abcabc", 3, "UNICODE", 4); - assertLocate("界x", "test大千世界X大千世界", 1, "UNICODE", 0); - assertLocate("界X", "test大千世界X大千世界", 1, "UNICODE", 8); - assertLocate("界", "test大千世界X大千世界", 13, "UNICODE", 13); - assertLocate("AA", "aaads", 1, "UNICODE_CI", 1); - assertLocate("aa", "aAads", 2, "UNICODE_CI", 2); - assertLocate("aa", "aaAds", 3, "UNICODE_CI", 0); - assertLocate("abC", "abcabc", 1, "UNICODE_CI", 1); - assertLocate("abC", "abCabc", 2, "UNICODE_CI", 4); - assertLocate("abc", "abcabc", 4, "UNICODE_CI", 4); - assertLocate("界x", "test大千世界X大千世界", 1, "UNICODE_CI", 8); - assertLocate("界", "test大千世界X大千世界", 13, "UNICODE_CI", 13); - assertLocate("大千", "test大千世界大千世界", 1, "UNICODE_CI", 5); - assertLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9); - assertLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); - // Case-variable character length - assertLocate("\u0307", "i̇", 1, "UTF8_BINARY", 2); - assertLocate("\u0307", "İ", 1, "UTF8_LCASE", 0); // != UTF8_BINARY - assertLocate("i", "i̇", 1, "UNICODE_CI", 0); - assertLocate("\u0307", "i̇", 1, "UNICODE_CI", 0); - assertLocate("i̇", "i", 1, "UNICODE_CI", 0); - assertLocate("İ", "i̇", 1, "UNICODE_CI", 1); - assertLocate("İ", "i", 1, "UNICODE_CI", 0); - assertLocate("i", "i̇", 1, "UTF8_LCASE", 1); // != UNICODE_CI - assertLocate("\u0307", "i̇", 1, "UTF8_LCASE", 2); // != UNICODE_CI - assertLocate("i̇", "i", 1, "UTF8_LCASE", 0); - assertLocate("İ", "i̇", 1, "UTF8_LCASE", 1); - assertLocate("İ", "i", 1, "UTF8_LCASE", 0); - assertLocate("i̇o", "İo世界大千世界", 1, "UNICODE_CI", 1); - 
assertLocate("i̇o", "大千İo世界大千世界", 1, "UNICODE_CI", 3); - assertLocate("i̇o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11); - assertLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1); - assertLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3); - assertLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); - // Greek sigmas. - assertLocate("σ", "σ", 1, "UTF8_BINARY", 1); - assertLocate("σ", "ς", 1, "UTF8_BINARY", 0); - assertLocate("σ", "Σ", 1, "UTF8_BINARY", 0); - assertLocate("ς", "σ", 1, "UTF8_BINARY", 0); - assertLocate("ς", "ς", 1, "UTF8_BINARY", 1); - assertLocate("ς", "Σ", 1, "UTF8_BINARY", 0); - assertLocate("Σ", "σ", 1, "UTF8_BINARY", 0); - assertLocate("Σ", "ς", 1, "UTF8_BINARY", 0); - assertLocate("Σ", "Σ", 1, "UTF8_BINARY", 1); - assertLocate("σ", "σ", 1, "UTF8_LCASE", 1); - assertLocate("σ", "ς", 1, "UTF8_LCASE", 1); - assertLocate("σ", "Σ", 1, "UTF8_LCASE", 1); - assertLocate("ς", "σ", 1, "UTF8_LCASE", 1); - assertLocate("ς", "ς", 1, "UTF8_LCASE", 1); - assertLocate("ς", "Σ", 1, "UTF8_LCASE", 1); - assertLocate("Σ", "σ", 1, "UTF8_LCASE", 1); - assertLocate("Σ", "ς", 1, "UTF8_LCASE", 1); - assertLocate("Σ", "Σ", 1, "UTF8_LCASE", 1); - assertLocate("σ", "σ", 1, "UNICODE", 1); - assertLocate("σ", "ς", 1, "UNICODE", 0); - assertLocate("σ", "Σ", 1, "UNICODE", 0); - assertLocate("ς", "σ", 1, "UNICODE", 0); - assertLocate("ς", "ς", 1, "UNICODE", 1); - assertLocate("ς", "Σ", 1, "UNICODE", 0); - assertLocate("Σ", "σ", 1, "UNICODE", 0); - assertLocate("Σ", "ς", 1, "UNICODE", 0); - assertLocate("Σ", "Σ", 1, "UNICODE", 1); - assertLocate("σ", "σ", 1, "UNICODE_CI", 1); - assertLocate("σ", "ς", 1, "UNICODE_CI", 1); - assertLocate("σ", "Σ", 1, "UNICODE_CI", 1); - assertLocate("ς", "σ", 1, "UNICODE_CI", 1); - assertLocate("ς", "ς", 1, "UNICODE_CI", 1); - assertLocate("ς", "Σ", 1, "UNICODE_CI", 1); - assertLocate("Σ", "σ", 1, "UNICODE_CI", 1); - assertLocate("Σ", "ς", 1, "UNICODE_CI", 1); - assertLocate("Σ", "Σ", 1, "UNICODE_CI", 1); + public void testStringLocate() throws SparkException 
{ + // Empty strings. + assertStringLocate("", "", -1, "UTF8_BINARY", 1); + assertStringLocate("", "", -1, "UTF8_LCASE", 1); + assertStringLocate("", "", -1, "UNICODE", 1); + assertStringLocate("", "", -1, "UNICODE_CI", 1); + assertStringLocate("", "", 0, "UTF8_BINARY", 1); + assertStringLocate("", "", 0, "UTF8_LCASE", 1); + assertStringLocate("", "", 0, "UNICODE", 1); + assertStringLocate("", "", 0, "UNICODE_CI", 1); + assertStringLocate("", "", 1, "UTF8_BINARY", 1); + assertStringLocate("", "", 1, "UTF8_LCASE", 1); + assertStringLocate("", "", 1, "UNICODE", 1); + assertStringLocate("", "", 1, "UNICODE_CI", 1); + assertStringLocate("a", "", -1, "UTF8_BINARY", 0); + assertStringLocate("a", "", -1, "UTF8_LCASE", 0); + assertStringLocate("a", "", -1, "UNICODE", 0); + assertStringLocate("a", "", -1, "UNICODE_CI", 0); + assertStringLocate("a", "", 0, "UTF8_BINARY", 0); + assertStringLocate("a", "", 0, "UTF8_LCASE", 0); + assertStringLocate("a", "", 0, "UNICODE", 0); + assertStringLocate("a", "", 0, "UNICODE_CI", 0); + assertStringLocate("a", "", 1, "UTF8_BINARY", 0); + assertStringLocate("a", "", 1, "UTF8_LCASE", 0); + assertStringLocate("a", "", 1, "UNICODE", 0); + assertStringLocate("a", "", 1, "UNICODE_CI", 0); + assertStringLocate("", "x", -1, "UTF8_BINARY", 1); + assertStringLocate("", "x", -1, "UTF8_LCASE", 1); + assertStringLocate("", "x", -1, "UNICODE", 1); + assertStringLocate("", "x", -1, "UNICODE_CI", 1); + assertStringLocate("", "x", 0, "UTF8_BINARY", 1); + assertStringLocate("", "x", 0, "UTF8_LCASE", 1); + assertStringLocate("", "x", 0, "UNICODE", 1); + assertStringLocate("", "x", 0, "UNICODE_CI", 1); + assertStringLocate("", "x", 1, "UTF8_BINARY", 1); + assertStringLocate("", "x", 1, "UTF8_LCASE", 1); + assertStringLocate("", "x", 1, "UNICODE", 1); + assertStringLocate("", "x", 1, "UNICODE_CI", 1); + // Basic tests. 
+ assertStringLocate("aa", "aaads", 1, "UTF8_BINARY", 1); + assertStringLocate("aa", "aaads", 1, "UTF8_LCASE", 1); + assertStringLocate("aa", "aaads", 1, "UNICODE", 1); + assertStringLocate("aa", "aaads", 1, "UNICODE_CI", 1); + assertStringLocate("aa", "aaads", 2, "UTF8_BINARY", 2); + assertStringLocate("aa", "aaads", 2, "UTF8_LCASE", 2); + assertStringLocate("aa", "aaads", 2, "UNICODE", 2); + assertStringLocate("aa", "aaads", 2, "UNICODE_CI", 2); + assertStringLocate("aa", "aaads", 3, "UTF8_BINARY", 0); + assertStringLocate("aa", "aaads", 3, "UTF8_LCASE", 0); + assertStringLocate("aa", "aaads", 3, "UNICODE", 0); + assertStringLocate("aa", "aaads", 3, "UNICODE_CI", 0); + assertStringLocate("Aa", "aaads", 1, "UTF8_BINARY", 0); + assertStringLocate("Aa", "aaads", 1, "UTF8_LCASE", 1); + assertStringLocate("Aa", "aaads", 1, "UNICODE", 0); + assertStringLocate("Aa", "aaads", 1, "UNICODE_CI", 1); + assertStringLocate("Aa", "aaads", 2, "UTF8_BINARY", 0); + assertStringLocate("Aa", "aaads", 2, "UTF8_LCASE", 2); + assertStringLocate("Aa", "aaads", 2, "UNICODE", 0); + assertStringLocate("Aa", "aaads", 2, "UNICODE_CI", 2); + assertStringLocate("Aa", "aaads", 3, "UTF8_BINARY", 0); + assertStringLocate("Aa", "aaads", 3, "UTF8_LCASE", 0); + assertStringLocate("Aa", "aaads", 3, "UNICODE", 0); + assertStringLocate("Aa", "aaads", 3, "UNICODE_CI", 0); + assertStringLocate("Aa", "aAads", 1, "UTF8_BINARY", 2); + assertStringLocate("Aa", "aAads", 1, "UTF8_LCASE", 1); + assertStringLocate("Aa", "aAads", 1, "UNICODE", 2); + assertStringLocate("Aa", "aAads", 1, "UNICODE_CI", 1); + assertStringLocate("AA", "aaads", 1, "UTF8_BINARY", 0); + assertStringLocate("AA", "aaads", 1, "UTF8_LCASE", 1); + assertStringLocate("AA", "aaads", 1, "UNICODE", 0); + assertStringLocate("AA", "aaads", 1, "UNICODE_CI", 1); + assertStringLocate("aa", "aAads", 2, "UTF8_BINARY", 0); + assertStringLocate("aa", "aAads", 2, "UTF8_LCASE", 2); + assertStringLocate("aa", "aAads", 2, "UNICODE", 0); + 
assertStringLocate("aa", "aAads", 2, "UNICODE_CI", 2); + assertStringLocate("aa", "aaAds", 3, "UTF8_BINARY", 0); + assertStringLocate("aa", "aaAds", 3, "UTF8_LCASE", 0); + assertStringLocate("aa", "aaAds", 3, "UNICODE", 0); + assertStringLocate("aa", "aaAds", 3, "UNICODE_CI", 0); + assertStringLocate("abC", "abcabc", 1, "UTF8_BINARY", 0); + assertStringLocate("abC", "abcabc", 1, "UTF8_LCASE", 1); + assertStringLocate("abC", "abcabc", 1, "UNICODE", 0); + assertStringLocate("abC", "abcabc", 1, "UNICODE_CI", 1); + assertStringLocate("abC", "abCabc", 2, "UTF8_BINARY", 0); + assertStringLocate("abC", "abCabc", 2, "UTF8_LCASE", 4); + assertStringLocate("abC", "abCabc", 2, "UNICODE", 0); + assertStringLocate("abC", "abCabc", 2, "UNICODE_CI", 4); + assertStringLocate("abc", "abcabc", 1, "UTF8_BINARY", 1); + assertStringLocate("abc", "abcabc", 1, "UTF8_LCASE", 1); + assertStringLocate("abc", "abcabc", 1, "UNICODE", 1); + assertStringLocate("abc", "abcabc", 1, "UNICODE_CI", 1); + assertStringLocate("abc", "abcabc", 2, "UTF8_BINARY", 4); + assertStringLocate("abc", "abcabc", 2, "UTF8_LCASE", 4); + assertStringLocate("abc", "abcabc", 2, "UNICODE", 4); + assertStringLocate("abc", "abcabc", 2, "UNICODE_CI", 4); + assertStringLocate("abc", "abcabc", 3, "UTF8_BINARY", 4); + assertStringLocate("abc", "abcabc", 3, "UTF8_LCASE", 4); + assertStringLocate("abc", "abcabc", 3, "UNICODE", 4); + assertStringLocate("abc", "abcabc", 3, "UNICODE_CI", 4); + assertStringLocate("abc", "abcabc", 4, "UTF8_BINARY", 4); + assertStringLocate("abc", "abcabc", 4, "UTF8_LCASE", 4); + assertStringLocate("abc", "abcabc", 4, "UNICODE", 4); + assertStringLocate("abc", "abcabc", 4, "UNICODE_CI", 4); + assertStringLocate("aa", "Aaads", 1, "UTF8_BINARY", 2); + assertStringLocate("aa", "Aaads", 1, "UTF8_LCASE", 1); + assertStringLocate("aa", "Aaads", 1, "UNICODE", 2); + assertStringLocate("aa", "Aaads", 1, "UNICODE_CI", 1); + // Advanced tests. 
+ assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0); + assertStringLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8); + assertStringLocate("界", "test大千世界X大千世界", 13, "UTF8_BINARY", 13); + assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_LCASE", 8); + assertStringLocate("界X", "test大千世界Xtest大千世界", 1, "UTF8_LCASE", 8); + assertStringLocate("界", "test大千世界X大千世界", 13, "UTF8_LCASE", 13); + assertStringLocate("大千", "test大千世界大千世界", 1, "UTF8_LCASE", 5); + assertStringLocate("大千", "test大千世界大千世界", 9, "UTF8_LCASE", 9); + assertStringLocate("大千", "大千世界大千世界", 1, "UTF8_LCASE", 1); + assertStringLocate("界x", "test大千世界X大千世界", 1, "UNICODE", 0); + assertStringLocate("界X", "test大千世界X大千世界", 1, "UNICODE", 8); + assertStringLocate("界", "test大千世界X大千世界", 13, "UNICODE", 13); + assertStringLocate("界x", "test大千世界X大千世界", 1, "UNICODE_CI", 8); + assertStringLocate("界", "test大千世界X大千世界", 13, "UNICODE_CI", 13); + assertStringLocate("大千", "test大千世界大千世界", 1, "UNICODE_CI", 5); + assertStringLocate("大千", "test大千世界大千世界", 9, "UNICODE_CI", 9); + assertStringLocate("大千", "大千世界大千世界", 1, "UNICODE_CI", 1); + // One-to-many case mapping (e.g. Turkish dotted I). 
+ assertStringLocate("\u0307", "i\u0307", 1, "UTF8_BINARY", 2); + assertStringLocate("\u0307", "İ", 1, "UTF8_LCASE", 0); // != UTF8_BINARY + assertStringLocate("i", "i\u0307", 1, "UNICODE_CI", 0); + assertStringLocate("\u0307", "i\u0307", 1, "UNICODE_CI", 0); + assertStringLocate("i\u0307", "i", 1, "UNICODE_CI", 0); + assertStringLocate("İ", "i\u0307", 1, "UNICODE_CI", 1); + assertStringLocate("İ", "i", 1, "UNICODE_CI", 0); + assertStringLocate("i", "i\u0307", 1, "UTF8_LCASE", 1); // != UNICODE_CI + assertStringLocate("\u0307", "i\u0307", 1, "UTF8_LCASE", 2); // != UNICODE_CI + assertStringLocate("i\u0307", "i", 1, "UTF8_LCASE", 0); + assertStringLocate("İ", "i\u0307", 1, "UTF8_LCASE", 1); + assertStringLocate("İ", "i", 1, "UTF8_LCASE", 0); + assertStringLocate("i\u0307o", "İo世界大千世界", 1, "UNICODE_CI", 1); + assertStringLocate("i\u0307o", "大千İo世界大千世界", 1, "UNICODE_CI", 3); + assertStringLocate("i\u0307o", "世界İo大千世界大千İo", 4, "UNICODE_CI", 11); + assertStringLocate("İo", "i̇o世界大千世界", 1, "UNICODE_CI", 1); + assertStringLocate("İo", "大千i̇o世界大千世界", 1, "UNICODE_CI", 3); + assertStringLocate("İo", "世界i̇o大千世界大千i̇o", 4, "UNICODE_CI", 12); + // Conditional case mapping (e.g. Greek sigmas). 
+ assertStringLocate("σ", "σ", 1, "UTF8_BINARY", 1); + assertStringLocate("σ", "ς", 1, "UTF8_BINARY", 0); + assertStringLocate("σ", "Σ", 1, "UTF8_BINARY", 0); + assertStringLocate("ς", "σ", 1, "UTF8_BINARY", 0); + assertStringLocate("ς", "ς", 1, "UTF8_BINARY", 1); + assertStringLocate("ς", "Σ", 1, "UTF8_BINARY", 0); + assertStringLocate("Σ", "σ", 1, "UTF8_BINARY", 0); + assertStringLocate("Σ", "ς", 1, "UTF8_BINARY", 0); + assertStringLocate("Σ", "Σ", 1, "UTF8_BINARY", 1); + assertStringLocate("σ", "σ", 1, "UTF8_LCASE", 1); + assertStringLocate("σ", "ς", 1, "UTF8_LCASE", 1); + assertStringLocate("σ", "Σ", 1, "UTF8_LCASE", 1); + assertStringLocate("ς", "σ", 1, "UTF8_LCASE", 1); + assertStringLocate("ς", "ς", 1, "UTF8_LCASE", 1); + assertStringLocate("ς", "Σ", 1, "UTF8_LCASE", 1); + assertStringLocate("Σ", "σ", 1, "UTF8_LCASE", 1); + assertStringLocate("Σ", "ς", 1, "UTF8_LCASE", 1); + assertStringLocate("Σ", "Σ", 1, "UTF8_LCASE", 1); + assertStringLocate("σ", "σ", 1, "UNICODE", 1); + assertStringLocate("σ", "ς", 1, "UNICODE", 0); + assertStringLocate("σ", "Σ", 1, "UNICODE", 0); + assertStringLocate("ς", "σ", 1, "UNICODE", 0); + assertStringLocate("ς", "ς", 1, "UNICODE", 1); + assertStringLocate("ς", "Σ", 1, "UNICODE", 0); + assertStringLocate("Σ", "σ", 1, "UNICODE", 0); + assertStringLocate("Σ", "ς", 1, "UNICODE", 0); + assertStringLocate("Σ", "Σ", 1, "UNICODE", 1); + assertStringLocate("σ", "σ", 1, "UNICODE_CI", 1); + assertStringLocate("σ", "ς", 1, "UNICODE_CI", 1); + assertStringLocate("σ", "Σ", 1, "UNICODE_CI", 1); + assertStringLocate("ς", "σ", 1, "UNICODE_CI", 1); + assertStringLocate("ς", "ς", 1, "UNICODE_CI", 1); + assertStringLocate("ς", "Σ", 1, "UNICODE_CI", 1); + assertStringLocate("Σ", "σ", 1, "UNICODE_CI", 1); + assertStringLocate("Σ", "ς", 1, "UNICODE_CI", 1); + assertStringLocate("Σ", "Σ", 1, "UNICODE_CI", 1); + // Surrogate pairs. 
+ assertStringLocate("a", "a🙃b", 1, "UTF8_BINARY", 1); + assertStringLocate("a", "a🙃b", 1, "UTF8_LCASE", 1); + assertStringLocate("a", "a🙃b", 1, "UNICODE", 1); + assertStringLocate("a", "a🙃b", 1, "UNICODE_CI", 1); + assertStringLocate("a", "a🙃b", 2, "UTF8_BINARY", 0); + assertStringLocate("a", "a🙃b", 2, "UTF8_LCASE", 0); + assertStringLocate("a", "a🙃b", 2, "UNICODE", 0); + assertStringLocate("a", "a🙃b", 2, "UNICODE_CI", 0); + assertStringLocate("a", "a🙃b", 3, "UTF8_BINARY", 0); + assertStringLocate("a", "a🙃b", 3, "UTF8_LCASE", 0); + assertStringLocate("a", "a🙃b", 3, "UNICODE", 0); + assertStringLocate("a", "a🙃b", 3, "UNICODE_CI", 0); + assertStringLocate("🙃", "a🙃b", 1, "UTF8_BINARY", 2); + assertStringLocate("🙃", "a🙃b", 1, "UTF8_LCASE", 2); + assertStringLocate("🙃", "a🙃b", 1, "UNICODE", 2); + assertStringLocate("🙃", "a🙃b", 1, "UNICODE_CI", 2); + assertStringLocate("🙃", "a🙃b", 2, "UTF8_BINARY", 2); + assertStringLocate("🙃", "a🙃b", 2, "UTF8_LCASE", 2); + assertStringLocate("🙃", "a🙃b", 2, "UNICODE", 2); + assertStringLocate("🙃", "a🙃b", 2, "UNICODE_CI", 2); + assertStringLocate("🙃", "a🙃b", 3, "UTF8_BINARY", 0); + assertStringLocate("🙃", "a🙃b", 3, "UTF8_LCASE", 0); + assertStringLocate("🙃", "a🙃b", 3, "UNICODE", 0); + assertStringLocate("🙃", "a🙃b", 3, "UNICODE_CI", 0); + assertStringLocate("b", "a🙃b", 1, "UTF8_BINARY", 3); + assertStringLocate("b", "a🙃b", 1, "UTF8_LCASE", 3); + assertStringLocate("b", "a🙃b", 1, "UNICODE", 3); + assertStringLocate("b", "a🙃b", 1, "UNICODE_CI", 3); + assertStringLocate("b", "a🙃b", 2, "UTF8_BINARY", 3); + assertStringLocate("b", "a🙃b", 2, "UTF8_LCASE", 3); + assertStringLocate("b", "a🙃b", 2, "UNICODE", 3); + assertStringLocate("b", "a🙃b", 2, "UNICODE_CI", 3); + assertStringLocate("b", "a🙃b", 3, "UTF8_BINARY", 3); + assertStringLocate("b", "a🙃b", 3, "UTF8_LCASE", 3); + assertStringLocate("b", "a🙃b", 3, "UNICODE", 3); + assertStringLocate("b", "a🙃b", 3, "UNICODE_CI", 3); + assertStringLocate("🙃", "a🙃🙃b", 1, "UTF8_BINARY", 2); + 
assertStringLocate("🙃", "a🙃🙃b", 1, "UTF8_LCASE", 2); + assertStringLocate("🙃", "a🙃🙃b", 1, "UNICODE", 2); + assertStringLocate("🙃", "a🙃🙃b", 1, "UNICODE_CI", 2); + assertStringLocate("🙃", "a🙃🙃b", 2, "UTF8_BINARY", 2); + assertStringLocate("🙃", "a🙃🙃b", 2, "UTF8_LCASE", 2); + assertStringLocate("🙃", "a🙃🙃b", 2, "UNICODE", 2); + assertStringLocate("🙃", "a🙃🙃b", 2, "UNICODE_CI", 2); + assertStringLocate("🙃", "a🙃🙃b", 3, "UTF8_BINARY", 3); + assertStringLocate("🙃", "a🙃🙃b", 3, "UTF8_LCASE", 3); + assertStringLocate("🙃", "a🙃🙃b", 3, "UNICODE", 3); + assertStringLocate("🙃", "a🙃🙃b", 3, "UNICODE_CI", 3); + assertStringLocate("🙃", "a🙃🙃b", 4, "UTF8_BINARY", 0); + assertStringLocate("🙃", "a🙃🙃b", 4, "UTF8_LCASE", 0); + assertStringLocate("🙃", "a🙃🙃b", 4, "UNICODE", 0); + assertStringLocate("🙃", "a🙃🙃b", 4, "UNICODE_CI", 0); + assertStringLocate("b", "a🙃🙃b", 1, "UTF8_BINARY", 4); + assertStringLocate("b", "a🙃🙃b", 1, "UTF8_LCASE", 4); + assertStringLocate("b", "a🙃🙃b", 1, "UNICODE", 4); + assertStringLocate("b", "a🙃🙃b", 1, "UNICODE_CI", 4); + assertStringLocate("b", "a🙃🙃b", 2, "UTF8_BINARY", 4); + assertStringLocate("b", "a🙃🙃b", 2, "UTF8_LCASE", 4); + assertStringLocate("b", "a🙃🙃b", 2, "UNICODE", 4); + assertStringLocate("b", "a🙃🙃b", 2, "UNICODE_CI", 4); + assertStringLocate("b", "a🙃🙃b", 3, "UTF8_BINARY", 4); + assertStringLocate("b", "a🙃🙃b", 3, "UTF8_LCASE", 4); + assertStringLocate("b", "a🙃🙃b", 3, "UNICODE", 4); + assertStringLocate("b", "a🙃🙃b", 3, "UNICODE_CI", 4); + assertStringLocate("b", "a🙃🙃b", 4, "UTF8_BINARY", 4); + assertStringLocate("b", "a🙃🙃b", 4, "UTF8_LCASE", 4); + assertStringLocate("b", "a🙃🙃b", 4, "UNICODE", 4); + assertStringLocate("b", "a🙃🙃b", 4, "UNICODE_CI", 4); + assertStringLocate("b", "a🙃x🙃b", 1, "UTF8_BINARY", 5); + assertStringLocate("b", "a🙃x🙃b", 1, "UTF8_LCASE", 5); + assertStringLocate("b", "a🙃x🙃b", 1, "UNICODE", 5); + assertStringLocate("b", "a🙃x🙃b", 1, "UNICODE_CI", 5); + assertStringLocate("b", "a🙃x🙃b", 2, "UTF8_BINARY", 5); + assertStringLocate("b", "a🙃x🙃b", 
2, "UTF8_LCASE", 5); + assertStringLocate("b", "a🙃x🙃b", 2, "UNICODE", 5); + assertStringLocate("b", "a🙃x🙃b", 2, "UNICODE_CI", 5); + assertStringLocate("b", "a🙃x🙃b", 3, "UTF8_BINARY", 5); + assertStringLocate("b", "a🙃x🙃b", 3, "UTF8_LCASE", 5); + assertStringLocate("b", "a🙃x🙃b", 3, "UNICODE", 5); + assertStringLocate("b", "a🙃x🙃b", 3, "UNICODE_CI", 5); + assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_BINARY", 5); + assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_LCASE", 5); + assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE", 5); + assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE_CI", 5); } private void assertSubstringIndex(String string, String delimiter, Integer count,