From 0b106aadb229c2f31c939c10817f9094d9e5c8b6 Mon Sep 17 00:00:00 2001 From: Stevo Mitric Date: Mon, 26 Aug 2024 13:51:45 +0200 Subject: [PATCH 1/3] fix out of bounds exception --- .../sql/catalyst/util/CollationAwareUTF8String.java | 5 +++++ .../spark/unsafe/types/CollationSupportSuite.java | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 9f26cc0bac21..6b8bf354236b 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -671,6 +671,11 @@ public static int indexOf(final UTF8String target, final UTF8String pattern, // Initialize the string search with respect to the specified ICU collation. String targetStr = target.toValidString(); String patternStr = pattern.toValidString(); + // Check if `start` is out of bounds. The provided offset `start` is given in number of + // codepoints, so a simple `targetStr.length` check is not sufficient here. This check is + // needed because `String.offsetByCodePoints` throws an `IndexOutOfBoundsException` + // exception when the offset is out of bounds. + if (targetStr.codePointCount(0, targetStr.length()) <= start) return MATCH_NOT_FOUND; StringSearch stringSearch = CollationFactory.getStringSearch(targetStr, patternStr, collationId); stringSearch.setOverlapping(true); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 202d73097453..8b107908467b 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -2329,6 +2329,19 @@ public void testStringLocate() throws SparkException { assertStringLocate("b", "a🙃x🙃b", 4, "UTF8_LCASE", 5); assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE", 5); assertStringLocate("b", "a🙃x🙃b", 4, "UNICODE_CI", 5); + // Out of bounds test cases. + assertStringLocate("a", "asd", 4, "UTF8_BINARY", 0); + assertStringLocate("a", "asd", 4, "UTF8_LCASE", 0); + assertStringLocate("a", "asd", 4, "UNICODE", 0); + assertStringLocate("a", "asd", 4, "UNICODE_CI", 0); + assertStringLocate("a", "asd", 100, "UTF8_BINARY", 0); + assertStringLocate("a", "asd", 100, "UTF8_LCASE", 0); + assertStringLocate("a", "asd", 100, "UNICODE", 0); + assertStringLocate("a", "asd", 100, "UNICODE_CI", 0); + assertStringLocate("a", "🙃🙃", 4, "UTF8_BINARY", 0); + assertStringLocate("a", "🙃🙃", 4, "UTF8_LCASE", 0); + assertStringLocate("a", "🙃🙃", 4, "UNICODE", 0); + assertStringLocate("a", "🙃🙃", 4, "UNICODE_CI", 0); } /** From 1a47f75f45a8432cfd1eca85dbebc333a3d741f0 Mon Sep 17 00:00:00 2001 From: Stevo Mitric Date: Mon, 26 Aug 2024 17:20:55 +0200 Subject: [PATCH 2/3] add empty substring tests --- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 8b107908467b..a10e73487508 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -2342,6 +2342,10 @@ public void testStringLocate() throws SparkException { assertStringLocate("a", "🙃🙃", 4, "UTF8_LCASE", 0); assertStringLocate("a", "🙃🙃", 4, "UNICODE", 0); assertStringLocate("a", "🙃🙃", 4, "UNICODE_CI", 0); + assertStringLocate("", "asd", 100, "UTF8_BINARY", 1); + assertStringLocate("", "asd", 100, "UTF8_LCASE", 1); + assertStringLocate("", "asd", 100, "UNICODE", 1); + assertStringLocate("", "asd", 100, "UNICODE_CI", 1); } /** From 76fef084d68c45a40dfef2d27d5cd557e8ea58a5 Mon Sep 17 00:00:00 2001 From: Stevo Mitric Date: Mon, 26 Aug 2024 17:22:03 +0200 Subject: [PATCH 3/3] add empty string tests --- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index a10e73487508..68e20d82fedc 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -2346,6 +2346,10 @@ public void testStringLocate() throws SparkException { assertStringLocate("", "asd", 100, "UTF8_LCASE", 1); assertStringLocate("", "asd", 100, "UNICODE", 1); assertStringLocate("", "asd", 100, "UNICODE_CI", 1); + assertStringLocate("asd", "", 100, "UTF8_BINARY", 0); + assertStringLocate("asd", "", 100, "UTF8_LCASE", 0); + assertStringLocate("asd", "", 100, "UNICODE", 0); + assertStringLocate("asd", "", 100, "UNICODE_CI", 0); } /**