diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c81dc10f8f5eb..6a3e9d313b59a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -1000,6 +1000,21 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) { } public UTF8String[] split(UTF8String pattern, int limit) { + // For an empty `pattern`, `split` ignores trailing empty strings unless the original + // string is empty. + if (numBytes() != 0 && pattern.numBytes() == 0) { + int newLimit = limit > numChars() || limit <= 0 ? numChars() : limit; + byte[] input = getBytes(); + int byteIndex = 0; + int charIndex = 0; + UTF8String[] result = new UTF8String[newLimit]; + while (charIndex < newLimit) { + int currCharNumBytes = numBytesForFirstByte(input[byteIndex]); + result[charIndex++] = UTF8String.fromBytes(input, byteIndex, currCharNumBytes); + byteIndex += currCharNumBytes; + } + return result; + } return split(pattern.toString(), limit); } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index f530c81df269e..8411d448dc9fd 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -409,6 +409,25 @@ public void split() { assertArrayEquals( new UTF8String[]{fromString("ab"), fromString("def,ghi,")}, fromString("ab,def,ghi,").split(fromString(","), 2)); + // Split with empty pattern ignores trailing empty strings. 
+ assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b")}, + fromString("ab").split(fromString(""), 0)); + assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b")}, + fromString("ab").split(fromString(""), -1)); + assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b")}, + fromString("ab").split(fromString(""), 2)); + assertArrayEquals( + new UTF8String[]{fromString("a"), fromString("b")}, + fromString("ab").split(fromString(""), 100)); + assertArrayEquals( + new UTF8String[]{fromString("a")}, + fromString("ab").split(fromString(""), 1)); + assertArrayEquals( + new UTF8String[]{fromString("")}, + fromString("").split(fromString(""), 0)); } @Test diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index ecd850e11a8ac..d69f245d8e8e3 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -28,6 +28,7 @@ license: | - Since Spark 3.4, v1 database, table, permanent view and function identifier will include 'spark_catalog' as the catalog name if database is defined, e.g. a table identifier will be: `spark_catalog.default.t`. To restore the legacy behavior, set `spark.sql.legacy.v1IdentifierNoCatalog` to `true`. - Since Spark 3.4, when ANSI SQL mode(configuration `spark.sql.ansi.enabled`) is on, Spark SQL always returns NULL result on getting a map value with a non-existing key. In Spark 3.3 or earlier, there will be an error. - Since Spark 3.4, the SQL CLI `spark-sql` does not print the prefix `Error in query:` before the error message of `AnalysisException`. + - Since Spark 3.4, the `split` function ignores trailing empty strings when the `regex` parameter is empty. 
## Upgrading from Spark SQL 3.2 to 3.3 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 4e474cbf8cd37..500c040dfe4e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -527,7 +527,7 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression) override def second: Expression = regex override def third: Expression = limit - def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1)); + def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1)) override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = { val strings = string.asInstanceOf[UTF8String].split( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 91b0f0e1039f6..9089963ee852c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -461,6 +461,21 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1) checkEvaluation(StringSplit(s1, s2, -1), null, row2) checkEvaluation(StringSplit(s1, s2, -1), null, row3) + // Empty regex + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 0), Seq("h", "e", "l", "l", "o"), row1) + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), -1), Seq("h", "e", "l", "l", "o"), row1) + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 5), Seq("h", "e", "l", "l", "o"), row1) 
+ checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 3), Seq("h", "e", "l"), row1) + checkEvaluation( + StringSplit(Literal("hello"), Literal(""), 100), Seq("h", "e", "l", "l", "o"), row1) + checkEvaluation( + StringSplit(Literal(""), Literal(""), -1), Seq(""), row1) + checkEvaluation( + StringSplit(Literal(""), Literal(""), 0), Seq(""), row1) // Test escaping of arguments GenerateUnsafeProjection.generate( diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 058ea89179786..efbef2ab449ba 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -26,6 +26,10 @@ select right("abcd", -2), right("abcd", 0), right("abcd", 'a'); -- split function SELECT split('aa1cc2ee3', '[1-9]+'); SELECT split('aa1cc2ee3', '[1-9]+', 2); +SELECT split('hello', ''); +SELECT split('', ''); +SELECT split('abc', null); +SELECT split(null, 'b'); -- split_part function SELECT split_part('11.12.13', '.', 2); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out index c6bbb4fb7179a..c06e825a36def 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out @@ -274,4 +274,4 @@ select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array -- !query schema struct>> -- !query output -[[""]] +[["h"]] diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index add89a635a83f..548c52c96620b 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -155,6 +155,38 @@ struct> ["aa","cc2ee3"] +-- !query +SELECT split('hello', '') +-- !query schema +struct> +-- !query output +["h","e","l","l","o"] + + +-- !query +SELECT split('', '') +-- !query schema +struct> +-- !query output +[""] + + +-- !query +SELECT split('abc', null) +-- !query schema +struct> +-- !query output +NULL + + +-- !query +SELECT split(null, 'b') +-- !query schema +struct> +-- !query output +NULL + + -- !query SELECT split_part('11.12.13', '.', 2) -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out index c6bbb4fb7179a..c06e825a36def 100644 --- a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out @@ -274,4 +274,4 @@ select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array -- !query schema struct>> -- !query output -[[""]] +[["h"]] diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index dedbd29d4bba1..f7c8689e800cd 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -121,6 +121,38 @@ struct> ["aa","cc2ee3"] +-- !query +SELECT split('hello', '') +-- !query schema +struct> +-- !query output +["h","e","l","l","o"] + + +-- !query +SELECT split('', '') +-- !query schema +struct> +-- !query output +[""] + + +-- !query +SELECT split('abc', null) +-- !query schema +struct> +-- !query output +NULL + + +-- !query +SELECT split(null, 'b') +-- !query schema +struct> +-- !query output +NULL + + -- !query SELECT split_part('11.12.13', '.', 2) -- !query schema