Skip to content
Closed
Show file tree
Hide file tree
Changes from 21 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions R/pkg/R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3404,19 +3404,27 @@ setMethod("collect_set",
#' Equivalent to \code{split} SQL function.
#'
#' @rdname column_string_functions
#' @param limit determines the length of the returned array.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we mention this is an optional param?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

going to include this in the @details section, as other functions like ltrim handle optionality of one of its params there.

#' \itemize{
#' \item \code{limit > 0}: length of the array will be at most \code{limit}
#' \item \code{limit <= 0}: the returned array can have any length
#' }
#'
#' @aliases split_string split_string,Column-method
#' @examples
#'
#' \dontrun{
#' head(select(df, split_string(df$Sex, "a")))
#' head(select(df, split_string(df$Class, "\\d")))
#' head(select(df, split_string(df$Class, "\\d", 2)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add documentation for R side too. Please document limit here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current build failure:

Undocumented arguments in documentation object 'column_string_functions'
  'limit'

Functions with \usage entries need to have the appropriate \alias
entries, and all their arguments documented.
The \usage entries must correspond to syntactically valid R code.
See the chapter 'Writing R documentation files' in the 'Writing R Extensions' manual.

#' # This is equivalent to the following SQL expression
#' head(selectExpr(df, "split(Class, '\\\\d')"))}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm i think L3418 shall be followed by L3420?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point - also the example should run in the order documented.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes will make that change @viirya @felixcheung

#' @note split_string 2.3.0
setMethod("split_string",
signature(x = "Column", pattern = "character"),
function(x, pattern) {
jc <- callJStatic("org.apache.spark.sql.functions", "split", x@jc, pattern)
function(x, pattern, limit = -1) {
jc <- callJStatic("org.apache.spark.sql.functions",
"split", x@jc, pattern, as.integer(limit))
column(jc)
})

Expand Down
2 changes: 1 addition & 1 deletion R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -1242,7 +1242,7 @@ setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array")

#' @rdname column_string_functions
#' @name NULL
setGeneric("split_string", function(x, pattern) { standardGeneric("split_string") })
setGeneric("split_string", function(x, pattern, ...) { standardGeneric("split_string") })

#' @rdname column_string_functions
#' @name NULL
Expand Down
4 changes: 4 additions & 0 deletions R/pkg/tests/fulltests/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1803,6 +1803,10 @@ test_that("string operators", {
collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
list(list("[email protected] 1", "b"))
)
expect_equal(
collect(select(df4, split_string(df4$a, "\\.", 2)))[1, 1],
list(list("a", "[email protected] 1\\b"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add a test for limit = 0 or limit = -1 too - while -1 is the default value, does any of the test cases change behavior for limit = -1?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added for limit = 0 to catch the "change behavior" case

)

l5 <- list(list(a = "abc"))
df5 <- createDataFrame(l5)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -952,6 +952,12 @@ public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
}

public UTF8String[] split(UTF8String pattern, int limit) {
// Java String's split method supports "ignore empty string" behavior when the limit is 0
// whereas other languages do not. To avoid this java specific behavior, we fall back to
// -1 when the limit is 0.
if (limit == 0) {
limit = -1;
}
String[] splits = toString().split(pattern.toString(), limit);
UTF8String[] res = new UTF8String[splits.length];
for (int i = 0; i < res.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,12 +394,14 @@ public void substringSQL() {

@Test
public void split() {
assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1),
new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi")}));
assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
UTF8String[] negativeAndZeroLimitCase =
new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi"), fromString("")};
assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 0),
negativeAndZeroLimitCase));
assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), -1),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why should we change the existing tests? Just add one test to check

    if (limit == 0) {
      limit = -1;
    }

Copy link
Author

@phegstrom phegstrom Sep 4, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@HyukjinKwon the last two were duplicates:

    assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
      new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
    assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
      new UTF8String[]{fromString("ab"), fromString("def,ghi")}));

And I also thought it better to include the case where you do get an empty string (adding one more instance of the regex at the end). Want me to revert? My view is it's more exhaustive of the expected behavior, and also easier to see that limit = -1 should behave exactly like limit = 0.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's fix the indentation to show less diff.

negativeAndZeroLimitCase));
assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 2),
new UTF8String[]{fromString("ab"), fromString("def,ghi,")}));
}

@Test
Expand Down
28 changes: 21 additions & 7 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1671,18 +1671,32 @@ def repeat(col, n):

@since(1.5)
@ignore_unicode_prefix
def split(str, pattern):
def split(str, pattern, limit=-1):
"""
Splits str around pattern (pattern is a regular expression).
Splits str around matches of the given pattern.

.. note:: pattern is a string represent the regular expression.
:param str: a string expression to split
:param pattern: a string representing a regular expression. The regex string should be
a Java regular expression.
:param limit: an integer which controls the number of times `pattern` is applied.

>>> df = spark.createDataFrame([('ab12cd',)], ['s',])
>>> df.select(split(df.s, '[0-9]+').alias('s')).collect()
[Row(s=[u'ab', u'cd'])]
* ``limit > 0``: The resulting array's length will not be more than `limit`, and the
resulting array's last entry will contain all input beyond the last
matched pattern.
* ``limit <= 0``: `pattern` will be applied as many times as possible, and the resulting
array can be of any size.

.. versionchanged:: 3.0
`split` now takes an optional `limit` field. If not provided, default limit value is -1.

>>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
>>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect()
[Row(s=[u'one', u'twoBthreeC'])]
>>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's turn this into an example without the limit argument.

[Row(s=[u'one', u'two', u'three', u''])]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.split(_to_java_column(str), pattern))
return Column(sc._jvm.functions.split(_to_java_column(str), pattern, limit))


@ignore_unicode_prefix
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi
arguments = """
Arguments:
* str - a string expression
* regexp - a string expression. The pattern string should be a Java regular expression.
* regexp - a string expression. The regex string should be a Java regular expression.

Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL
parser. For example, to match "\abc", a regular expression for `regexp` can be
Expand Down Expand Up @@ -229,33 +229,53 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress


/**
* Splits str around pat (pattern is a regular expression).
* Splits str around matches of the given regex.
*/
@ExpressionDescription(
usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match `regex`.",
usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences that match `regex`" +
" and returns an array with a length of at most `limit`",
arguments = """
Arguments:
* str - a string expression to split.
* regex - a string representing a regular expression. The regex string should be a
Java regular expression.
* limit - an integer expression which controls the number of times the regex is applied.
* limit > 0: The resulting array's length will not be more than `limit`,
and the resulting array's last entry will contain all input
beyond the last matched regex.
* limit <= 0: `regex` will be applied as many times as possible, and
the resulting array can be of any size.
""",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about this formatting?


 function_desc | Extended Usage:
    Arguments:
      * str - a string expression to split.
      * pattern - a string representing a regular expression. The pattern string should be a
        Java regular expression.
      * limit - an integer expression which controls the number of times the pattern is applied.

        limit > 0: The resulting array's length will not be more than `limit`, and the resulting array's
                   last entry will contain all input beyond the last matched pattern.
        limit < 0: `pattern` will be applied as many times as possible, and the resulting
                   array can be of any size.
        limit = 0: `pattern` will be applied as many times as possible, the resulting array can
                   be of any size, and trailing empty strings will be discarded.
  
    Examples:
      > SELECT split('oneAtwoBthreeC', '[ABC]');
       ["one","two","three",""]
      > SELECT split('oneAtwoBthreeC', '[ABC]', 0);
       ["one","two","three"]
      > SELECT split('oneAtwoBthreeC', '[ABC]', 2);
       ["one","twoBthreeC"]

examples = """
Examples:
> SELECT _FUNC_('oneAtwoBthreeC', '[ABC]');
["one","two","three",""]
> SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', -1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better to keep original example for default value.

["one","two","three",""]
> SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2);
["one","twoBthreeC"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add the negative case?

""")
case class StringSplit(str: Expression, pattern: Expression)
extends BinaryExpression with ImplicitCastInputTypes {
case class StringSplit(str: Expression, regex: Expression, limit: Expression)
extends TernaryExpression with ImplicitCastInputTypes {

override def left: Expression = str
override def right: Expression = pattern
override def dataType: DataType = ArrayType(StringType)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
override def children: Seq[Expression] = str :: regex :: limit :: Nil

override def nullSafeEval(string: Any, regex: Any): Any = {
val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1));

override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we still need to do some checks on limit. According to the Presto documentation, limit must be a positive number. -1 is only used when no limit parameter is given (default value).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya the underlying implementation of this method is Java.lang.String, correct? This method does allow non-positive values for limit, not sure what Presto is using.

val strings = string.asInstanceOf[UTF8String].split(
regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
new GenericArrayData(strings.asInstanceOf[Array[Any]])
}

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val arrayClass = classOf[GenericArrayData].getName
nullSafeCodeGen(ctx, ev, (str, pattern) =>
nullSafeCodeGen(ctx, ev, (str, regex, limit) => {
// Array in java is covariant, so we don't need to cast UTF8String[] to Object[].
s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""")
s"""${ev.value} = new $arrayClass($str.split($regex,$limit));""".stripMargin
})
}

override def prettyName: String = "split"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
val row3 = create_row("aa2bb3cc", null)

checkEvaluation(
StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), -1), Seq("aa", "bb", "cc"), row1)
checkEvaluation(
StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
checkEvaluation(StringSplit(s1, s2), null, row2)
checkEvaluation(StringSplit(s1, s2), null, row3)
StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), 2), Seq("aa", "bb3cc"), row1)
// limit = 0 should behave just like limit = -1
checkEvaluation(
StringSplit(Literal("aacbbcddc"), Literal("c"), 0), Seq("aa", "bb", "dd", ""), row1)
checkEvaluation(
StringSplit(Literal("aacbbcddc"), Literal("c"), -1), Seq("aa", "bb", "dd", ""), row1)
checkEvaluation(
StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1)
checkEvaluation(StringSplit(s1, s2, -1), null, row2)
checkEvaluation(StringSplit(s1, s2, -1), null, row3)
}

}
32 changes: 28 additions & 4 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2546,15 +2546,39 @@ object functions {
def soundex(e: Column): Column = withExpr { SoundEx(e.expr) }

/**
* Splits str around pattern (pattern is a regular expression).
* Splits str around matches of the given regex.
*
* @note Pattern is a string representation of the regular expression.
* @param str a string expression to split
* @param regex a string representing a regular expression. The regex string should be
* a Java regular expression.
*
* @group string_funcs
* @since 1.5.0
*/
def split(str: Column, pattern: String): Column = withExpr {
StringSplit(str.expr, lit(pattern).expr)
def split(str: Column, regex: String): Column = withExpr {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we just keep it as pattern? I think we'd better not change the name. Doesn't pattern also make sense?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I changed it is that every time we mentioned pattern in the comments/docs, we always added a phrase like "pattern, which is a regular expression ..."

just felt like unnecessary explanation needed if we called the variable regex. Happy to change if you think necessary though!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, I don't think we should change the name when either one makes sense in a way.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

by having it as regex, the documentation will require less explanation. @HyukjinKwon if you are ok with keeping it as regex I think we should prefer to keep this change.

Happy to revert as well of course

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this an API breaking change?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes it is for source compatibility in scala

Copy link
Contributor

@cloud-fan cloud-fan Mar 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea Scala is sensitive to parameter name, as the caller can do: split(str = ..., pattern = ...)

so this is binary-compatible but not source-compatible. @HyukjinKwon can you help revert this line?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, but for the record, such changes have already been made, not only on the SQL side but also on the SS side if I remember correctly, because users are expected to edit their source when they compile against Spark 3.0, and it doesn't break existing compiled apps. I am not sure why this one is special, but sure, it's easy to keep the compat with a minimal change.

StringSplit(str.expr, Literal(regex), Literal(-1))
}

/**
* Splits str around matches of the given regex.
*
* @param str a string expression to split
* @param regex a string representing a regular expression. The regex string should be
* a Java regular expression.
* @param limit an integer expression which controls the number of times the regex is applied.
* <ul>
* <li>limit greater than 0: The resulting array's length will not be more than limit,
* and the resulting array's last entry will contain all input beyond the last
* matched regex.</li>
* <li>limit less than or equal to 0: `regex` will be applied as many times as
* possible, and the resulting array can be of any size.</li>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we don't need </li>.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was asked to do <li> earlier in this PR conversation. @HyukjinKwon -- thoughts here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean we may not need ending tag </li>.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, I'll look into that

Copy link
Author

@phegstrom phegstrom Oct 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya throughout this repository, the </li> has always been included. For consistency, I think we should just keep it as is. Let me know what you think

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. Then it's fine. Thanks for looking at it.

* </ul>
Copy link
Member

@HyukjinKwon HyukjinKwon Sep 10, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use the same way to make it multiple lines

* <ul>
* <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
* <li>`prefersDecimal` (default `false`): infers all floating-point values as a decimal
* type. If the values do not fit in decimal, then it infers them as doubles.</li>
* <li>`allowComments` (default `false`): ignores Java/C++ style comment in JSON records</li>
* <li>`allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names</li>
* <li>`allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes
* </li>
* <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
* (e.g. 00012)</li>
* <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
* character using backslash quoting mechanism</li>
* <li>`allowUnquotedControlChars` (default `false`): allows JSON Strings to contain unquoted
* control characters (ASCII characters with value less than 32, including tab and line feed
* characters) or not.</li>
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
* during parsing.
* <ul>
* <li>`PERMISSIVE` : when it meets a corrupted record, puts the malformed string into a
* field configured by `columnNameOfCorruptRecord`, and sets other fields to `null`. To
* keep corrupt records, an user can set a string type field named
* `columnNameOfCorruptRecord` in an user-defined schema. If a schema does not have the
* field, it drops corrupt records during parsing. When inferring a schema, it implicitly
* adds a `columnNameOfCorruptRecord` field in an output schema.</li>
* <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
* <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
* </ul>
* </li>
* <li>`columnNameOfCorruptRecord` (default is the value specified in
* `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
* created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
* <li>`dateFormat` (default `yyyy-MM-dd`): sets the string that indicates a date format.
* Custom date formats follow the formats at `java.text.SimpleDateFormat`. This applies to
* date type.</li>
* <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
* indicates a timestamp format. Custom date formats follow the formats at
* `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
* <li>`multiLine` (default `false`): parse one record, which may span multiple lines,
* per file</li>
* <li>`encoding` (by default it is not set): allows to forcibly set one of standard basic
* or extended encoding for the JSON files. For example UTF-16BE, UTF-32LE. If the encoding
* is not specified and `multiLine` is set to `true`, it will be detected automatically.</li>
* <li>`lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator
* that should be used for parsing.</li>
* <li>`samplingRatio` (default is 1.0): defines fraction of input JSON objects used
* for schema inferring.</li>
* <li>`dropFieldIfAllNull` (default `false`): whether to ignore column of all null values or
* empty array/struct during schema inference.</li>
* </ul>

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can just:

   *        <ul>
   *          <li>limit greater than 0: The resulting array's length will not be more than limit,
   *          and the resulting array's last entry will contain all input
   *          beyond the last matched regex.</li>
   *          <li>limit less than or equal to 0: `regex` will be applied as many times as possible,
   *          and the resulting array can be of any size.</li>
   *        </ul>

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh I thought you wanted to have the explanations as sub bullets, will make that change @HyukjinKwon

*
* @group string_funcs
* @since 3.0.0
*/
def split(str: Column, regex: String, limit: Int): Column = withExpr {
  // Wrap the plain Scala arguments as literal expressions before building the split expression.
  val source = str.expr
  val regexLiteral = Literal(regex)
  val limitLiteral = Literal(limit)
  StringSplit(source, regexLiteral, limitLiteral)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,8 @@ FROM (
encode(string(id + 2), 'utf-8') col3,
encode(string(id + 3), 'utf-8') col4
FROM range(10)
)
);

-- split function
SELECT split('aa1cc2ee3', '[1-9]+');
SELECT split('aa1cc2ee3', '[1-9]+', 2);
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 15
-- Number of queries: 17


-- !query 0
Expand Down Expand Up @@ -161,3 +161,19 @@ struct<plan:string>
== Physical Plan ==
*Project [concat(cast(id#xL as string), cast(encode(cast((id#xL + 2) as string), utf-8) as string), cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]
+- *Range (0, 10, step=1, splits=2)


-- !query 15
SELECT split('aa1cc2ee3', '[1-9]+')
-- !query 15 schema
struct<split(aa1cc2ee3, [1-9]+, -1):array<string>>
-- !query 15 output
["aa","cc","ee",""]


-- !query 16
SELECT split('aa1cc2ee3', '[1-9]+', 2)
-- !query 16 schema
struct<split(aa1cc2ee3, [1-9]+, 2):array<string>>
-- !query 16 output
["aa","cc2ee3"]
Original file line number Diff line number Diff line change
Expand Up @@ -329,16 +329,52 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
Row(" "))
}

test("string split function") {
val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
test("string split function with no limit") {
val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")

checkAnswer(
df.select(split($"a", "[1-9]+")),
Row(Seq("aa", "bb", "cc")))
Row(Seq("aa", "bb", "cc", "")))

checkAnswer(
df.selectExpr("split(a, '[1-9]+')"),
Row(Seq("aa", "bb", "cc")))
Row(Seq("aa", "bb", "cc", "")))
}

test("string split function with limit explicitly set to 0") {
  // Both the Column API and the SQL expression are expected to keep the trailing empty string.
  val input = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
  val expected = Row(Seq("aa", "bb", "cc", ""))

  // Column-based API
  checkAnswer(input.select(split($"a", "[1-9]+", 0)), expected)

  // SQL expression
  checkAnswer(input.selectExpr("split(a, '[1-9]+', 0)"), expected)
}

test("string split function with positive limit") {
  // With limit 2 the expected array has two entries; the second holds the unsplit remainder.
  val input = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
  val expected = Row(Seq("aa", "bb3cc4"))

  // Column-based API
  checkAnswer(input.select(split($"a", "[1-9]+", 2)), expected)

  // SQL expression
  checkAnswer(input.selectExpr("split(a, '[1-9]+', 2)"), expected)
}

test("string split function with negative limit") {
  // A negative limit is expected to split on every match and keep the trailing empty string.
  val input = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
  val expected = Row(Seq("aa", "bb", "cc", ""))

  // Column-based API
  checkAnswer(input.select(split($"a", "[1-9]+", -2)), expected)

  // SQL expression
  checkAnswer(input.selectExpr("split(a, '[1-9]+', -2)"), expected)
}

test("string / binary length function") {
Expand Down