Skip to content
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -1000,6 +1000,21 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
}

/**
 * Splits this string around matches of {@code pattern}.
 *
 * <p>When {@code pattern} is empty and this string is not, the string is broken into its
 * individual characters here, without going through the {@code String}-based overload, and
 * no trailing empty string is produced. In that case a {@code limit} that is non-positive
 * or larger than the character count yields one element per character; otherwise only the
 * first {@code limit} characters are returned.
 *
 * <p>In every other case (including an empty input string) the call is delegated to
 * {@code split(String, int)}.
 *
 * @param pattern the pattern to split around; may be empty
 * @param limit   maximum number of elements to return; see above for the empty-pattern case
 * @return the resulting pieces of this string
 */
public UTF8String[] split(UTF8String pattern, int limit) {
  // For the empty `pattern` a `split` function ignores trailing empty strings unless original
  // string is empty.
  if (numBytes() != 0 && pattern.numBytes() == 0) {
    // Evaluate the character count once instead of up to three times in the limit expression.
    final int totalChars = numChars();
    final int newLimit = (limit <= 0 || limit > totalChars) ? totalChars : limit;
    final byte[] input = getBytes();
    final UTF8String[] result = new UTF8String[newLimit];
    int byteIndex = 0;
    for (int charIndex = 0; charIndex < newLimit; charIndex++) {
      // Never let a character extend past the end of `input`. This only matters if the last
      // sequence is truncated (malformed UTF-8); well-formed input is unaffected.
      final int currCharNumBytes =
        Math.min(numBytesForFirstByte(input[byteIndex]), input.length - byteIndex);
      result[charIndex] = UTF8String.fromBytes(input, byteIndex, currCharNumBytes);
      byteIndex += currCharNumBytes;
    }
    return result;
  }
  return split(pattern.toString(), limit);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,25 @@ public void split() {
assertArrayEquals(
new UTF8String[]{fromString("ab"), fromString("def,ghi,")},
fromString("ab,def,ghi,").split(fromString(","), 2));
// Split with empty pattern ignores trailing empty strings.
assertArrayEquals(
new UTF8String[]{fromString("a"), fromString("b")},
fromString("ab").split(fromString(""), 0));
assertArrayEquals(
new UTF8String[]{fromString("a"), fromString("b")},
fromString("ab").split(fromString(""), -1));
assertArrayEquals(
new UTF8String[]{fromString("a"), fromString("b")},
fromString("ab").split(fromString(""), 2));
assertArrayEquals(
new UTF8String[]{fromString("a"), fromString("b")},
fromString("ab").split(fromString(""), 100));
assertArrayEquals(
new UTF8String[]{fromString("a")},
fromString("ab").split(fromString(""), 1));
assertArrayEquals(
new UTF8String[]{fromString("")},
fromString("").split(fromString(""), 0));
}

@Test
Expand Down
1 change: 1 addition & 0 deletions docs/sql-migration-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ license: |
- Since Spark 3.4, v1 database, table, permanent view and function identifier will include 'spark_catalog' as the catalog name if database is defined, e.g. a table identifier will be: `spark_catalog.default.t`. To restore the legacy behavior, set `spark.sql.legacy.v1IdentifierNoCatalog` to `true`.
- Since Spark 3.4, when ANSI SQL mode (configuration `spark.sql.ansi.enabled`) is on, Spark SQL always returns a NULL result when getting a map value with a non-existing key. In Spark 3.3 or earlier, there will be an error.
- Since Spark 3.4, the SQL CLI `spark-sql` does not print the prefix `Error in query:` before the error message of `AnalysisException`.
- Since Spark 3.4, the `split` function ignores trailing empty strings when the `regex` parameter is empty.

## Upgrading from Spark SQL 3.2 to 3.3

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression)
override def second: Expression = regex
override def third: Expression = limit

def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1));
def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1))

override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
val strings = string.asInstanceOf[UTF8String].split(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,21 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1)
checkEvaluation(StringSplit(s1, s2, -1), null, row2)
checkEvaluation(StringSplit(s1, s2, -1), null, row3)
// Empty regex
checkEvaluation(
StringSplit(Literal("hello"), Literal(""), 0), Seq("h", "e", "l", "l", "o"), row1)
checkEvaluation(
StringSplit(Literal("hello"), Literal(""), -1), Seq("h", "e", "l", "l", "o"), row1)
checkEvaluation(
StringSplit(Literal("hello"), Literal(""), 5), Seq("h", "e", "l", "l", "o"), row1)
checkEvaluation(
StringSplit(Literal("hello"), Literal(""), 3), Seq("h", "e", "l"), row1)
checkEvaluation(
StringSplit(Literal("hello"), Literal(""), 100), Seq("h", "e", "l", "l", "o"), row1)
checkEvaluation(
StringSplit(Literal(""), Literal(""), -1), Seq(""), row1)
checkEvaluation(
StringSplit(Literal(""), Literal(""), 0), Seq(""), row1)

// Test escaping of arguments
GenerateUnsafeProjection.generate(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ select right("abcd", -2), right("abcd", 0), right("abcd", 'a');
-- split function
SELECT split('aa1cc2ee3', '[1-9]+');
SELECT split('aa1cc2ee3', '[1-9]+', 2);
SELECT split('hello', '');
SELECT split('', '');
SELECT split('abc', null);
SELECT split(null, 'b');

-- split_part function
SELECT split_part('11.12.13', '.', 2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,4 @@ select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array
-- !query schema
struct<aggregate(split(abcdefgh, , -1), array(array()), lambdafunction(array(array(namedlambdavariable())), namedlambdavariable(), namedlambdavariable()), lambdafunction(namedlambdavariable(), namedlambdavariable())):array<array<string>>>
-- !query output
[[""]]
[["h"]]
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,38 @@ struct<split(aa1cc2ee3, [1-9]+, 2):array<string>>
["aa","cc2ee3"]


-- !query
SELECT split('hello', '')
-- !query schema
struct<split(hello, , -1):array<string>>
-- !query output
["h","e","l","l","o"]


-- !query
SELECT split('', '')
-- !query schema
struct<split(, , -1):array<string>>
-- !query output
[""]


-- !query
SELECT split('abc', null)
-- !query schema
struct<split(abc, NULL, -1):array<string>>
-- !query output
NULL


-- !query
SELECT split(null, 'b')
-- !query schema
struct<split(NULL, b, -1):array<string>>
-- !query output
NULL


-- !query
SELECT split_part('11.12.13', '.', 2)
-- !query schema
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,4 +274,4 @@ select aggregate(split('abcdefgh',''), array(array('')), (acc, x) -> array(array
-- !query schema
struct<aggregate(split(abcdefgh, , -1), array(array()), lambdafunction(array(array(namedlambdavariable())), namedlambdavariable(), namedlambdavariable()), lambdafunction(namedlambdavariable(), namedlambdavariable())):array<array<string>>>
-- !query output
[[""]]
[["h"]]
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,38 @@ struct<split(aa1cc2ee3, [1-9]+, 2):array<string>>
["aa","cc2ee3"]


-- !query
SELECT split('hello', '')
-- !query schema
struct<split(hello, , -1):array<string>>
-- !query output
["h","e","l","l","o"]


-- !query
SELECT split('', '')
-- !query schema
struct<split(, , -1):array<string>>
-- !query output
[""]


-- !query
SELECT split('abc', null)
-- !query schema
struct<split(abc, NULL, -1):array<string>>
-- !query output
NULL


-- !query
SELECT split(null, 'b')
-- !query schema
struct<split(NULL, b, -1):array<string>>
-- !query output
NULL


-- !query
SELECT split_part('11.12.13', '.', 2)
-- !query schema
Expand Down