Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -232,30 +232,49 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress
* Splits str around pat (pattern is a regular expression).
*/
@ExpressionDescription(
usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match `regex`.",
usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences that match `regex`." +
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you refine the description and the format along with the others, e.g., RLike

* pattern - a string expression. The pattern is a string which is matched literally, with

"The `limit` parameter controls the number of times the pattern is applied. If the limit " +
"n is greater than zero then the pattern will be applied at most n - 1 times, " +
"the array's length will be no greater than n, and the array's last entry " +
"will contain all input beyond the last matched delimiter. If n is " +
"less than 0, then the pattern will be applied as many times as " +
"possible and the array can have any length. If n is zero then the " +
"pattern will be applied as many times as possible, the array can " +
"have any length, and trailing empty strings will be discarded.",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, is it possible to make this usage more compact? I think the usage here should be concise.

Copy link
Author

@phegstrom phegstrom Aug 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya i'll take a crack at it -- the usage is a bit funky given the different behavior based on what limit is, I wanted to err on the side of verbose

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for #22227 (comment). The doc should better be concise.

Can we just move those limit specific description into the arguments at limit - a..? This looks a bit messy.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will do!

arguments = """
Arguments:
* str - a string expression to split.
* pattern - a string representing a regular expression. The pattern string should be a
Java regular expression.
* limit - an integer expression.
""",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about this formatting?;


 function_desc | Extended Usage:
    Arguments:
      * str - a string expression to split.
      * pattern - a string representing a regular expression. The pattern string should be a
        Java regular expression.
      * limit - an integer expression which controls the number of times the pattern is applied.

        limit > 0: The resulting array's length will not be more than `limit`, and the resulting array's
                   last entry will contain all input beyond the last matched pattern.
        limit < 0: `pattern` will be applied as many times as possible, and the resulting
                   array can be of any size.
        limit = 0: `pattern` will be applied as many times as possible, the resulting array can
                   be of any size, and trailing empty strings will be discarded.
  
    Examples:
      > SELECT split('oneAtwoBthreeC', '[ABC]');
       ["one","two","three",""]
      > SELECT split('oneAtwoBthreeC', '[ABC]', 0);
       ["one","two","three"]
      > SELECT split('oneAtwoBthreeC', '[ABC]', 2);
       ["one","twoBthreeC"]

examples = """
Examples:
> SELECT _FUNC_('oneAtwoBthreeC', '[ABC]');
["one","two","three",""]
| > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

["one","twoBthreeC"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add the netative case?

""")
case class StringSplit(str: Expression, pattern: Expression)
extends BinaryExpression with ImplicitCastInputTypes {
case class StringSplit(str: Expression, pattern: Expression, limit: Expression)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We still need to support 2 arguments. Please add a constructor def this(str: Expression, pattern: Expression).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For test coverage, better to add tests in string-functions.sql for the two cases: two arguments and three arguments.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@maropu which tests use string-functions.sql? would like to add tests here but not sure how to explicitly kick off the test as there are no *Suites which use this file it seems.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ ignore this! found it @maropu

extends TernaryExpression with ImplicitCastInputTypes {

override def left: Expression = str
override def right: Expression = pattern
override def dataType: DataType = ArrayType(StringType)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
override def children: Seq[Expression] = str :: pattern :: limit :: Nil

def this(exp: Expression, pattern: Expression) = this(exp, pattern, Literal(-1));

override def nullSafeEval(string: Any, regex: Any): Any = {
val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we still need to do some check on limit. According to Presto document, limit must be a positive number. -1 is only used when no limit parameter is given (default value).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya the underlying implementation of this method is Java.lang.String, correct? This method does allow non-positive values for limit, not sure what Presto is using.

val strings = string.asInstanceOf[UTF8String].split(
regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
new GenericArrayData(strings.asInstanceOf[Array[Any]])
}

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val arrayClass = classOf[GenericArrayData].getName
nullSafeCodeGen(ctx, ev, (str, pattern) =>
nullSafeCodeGen(ctx, ev, (str, pattern, limit) =>
// Array in java is covariant, so we don't need to cast UTF8String[] to Object[].
s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""")
s"""${ev.value} = new $arrayClass($str.split($pattern, $limit));""")
}

override def prettyName: String = "split"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,17 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
val row3 = create_row("aa2bb3cc", null)

checkEvaluation(
StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), -1), Seq("aa", "bb", "cc"), row1)
checkEvaluation(
StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
checkEvaluation(StringSplit(s1, s2), null, row2)
checkEvaluation(StringSplit(s1, s2), null, row3)
StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), 2), Seq("aa", "bb3cc"), row1)
checkEvaluation(
StringSplit(Literal("aacbbcddc"), Literal("c"), 0), Seq("aa", "bb", "dd"), row1)
checkEvaluation(
StringSplit(Literal("aacbbcddc"), Literal("c"), -1), Seq("aa", "bb", "dd", ""), row1)
checkEvaluation(
StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1)
checkEvaluation(StringSplit(s1, s2, -1), null, row2)
checkEvaluation(StringSplit(s1, s2, -1), null, row3)
}

}
22 changes: 21 additions & 1 deletion sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2554,7 +2554,27 @@ object functions {
* @since 1.5.0
*/
def split(str: Column, pattern: String): Column = withExpr {
StringSplit(str.expr, lit(pattern).expr)
StringSplit(str.expr, Literal(pattern), Literal(-1))
}

/**
* Splits str around pattern (pattern is a regular expression).
*
* The limit parameter controls the number of times the pattern is applied and therefore
* affects the length of the resulting array. If the limit n is greater than zero then the
* pattern will be applied at most n - 1 times, the array's length will be no greater than
* n, and the array's last entry will contain all input beyond the last matched delimiter.
* If n is non-positive then the pattern will be applied as many times as possible and the
* array can have any length. If n is zero then the pattern will be applied as many times as
* possible, the array can have any length, and trailing empty strings will be discarded.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you copy SQL's doc here? You could describe them via @param here as well.

*
* @note Pattern is a string representation of the regular expression.
*
* @group string_funcs
* @since 2.4.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto for 3.0.0

*/
def split(str: Column, pattern: String, limit: Int): Column = withExpr {
StringSplit(str.expr, Literal(pattern), Literal(limit))
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ select format_string();
-- A pipe operator for string concatenation
select 'a' || 'b' || 'c';

-- split function
select split('aa1cc2ee', '[1-9]+', 2);
select split('aa1cc2ee', '[1-9]+');

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move these tests to the end of this file in order to decrease unnecessary changes in the golden file.

-- Check if catalyst combine nested `Concat`s
EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col
FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 15
-- Number of queries: 17


-- !query 0
Expand Down Expand Up @@ -29,11 +29,27 @@ abc


-- !query 3
select split('aa1cc2ee', '[1-9]+', 2)
-- !query 3 schema
struct<split(aa1cc2ee, [1-9]+, 2):array<string>>
-- !query 3 output
["aa","cc2ee"]


-- !query 4
select split('aa1cc2ee', '[1-9]+')
-- !query 4 schema
struct<split(aa1cc2ee, [1-9]+, -1):array<string>>
-- !query 4 output
["aa","cc","ee"]


-- !query 5
EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col
FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10))
-- !query 3 schema
-- !query 5 schema
struct<plan:string>
-- !query 3 output
-- !query 5 output
== Parsed Logical Plan ==
'Project [concat(concat(concat('col1, 'col2), 'col3), 'col4) AS col#x]
+- 'SubqueryAlias `__auto_generated_subquery_name`
Expand All @@ -56,79 +72,79 @@ Project [concat(cast(id#xL as string), cast(id#xL as string), cast(id#xL as stri
+- *Range (0, 10, step=1, splits=2)


-- !query 4
-- !query 6
select replace('abc', 'b', '123')
-- !query 4 schema
-- !query 6 schema
struct<replace(abc, b, 123):string>
-- !query 4 output
-- !query 6 output
a123c


-- !query 5
-- !query 7
select replace('abc', 'b')
-- !query 5 schema
-- !query 7 schema
struct<replace(abc, b, ):string>
-- !query 5 output
-- !query 7 output
ac


-- !query 6
-- !query 8
select length(uuid()), (uuid() <> uuid())
-- !query 6 schema
-- !query 8 schema
struct<length(uuid()):int,(NOT (uuid() = uuid())):boolean>
-- !query 6 output
-- !query 8 output
36 true


-- !query 7
-- !query 9
select position('bar' in 'foobarbar'), position(null, 'foobarbar'), position('aaads', null)
-- !query 7 schema
-- !query 9 schema
struct<locate(bar, foobarbar, 1):int,locate(CAST(NULL AS STRING), foobarbar, 1):int,locate(aaads, CAST(NULL AS STRING), 1):int>
-- !query 7 output
-- !query 9 output
4 NULL NULL


-- !query 8
-- !query 10
select left("abcd", 2), left("abcd", 5), left("abcd", '2'), left("abcd", null)
-- !query 8 schema
-- !query 10 schema
struct<left('abcd', 2):string,left('abcd', 5):string,left('abcd', '2'):string,left('abcd', NULL):string>
-- !query 8 output
-- !query 10 output
ab abcd ab NULL


-- !query 9
-- !query 11
select left(null, -2), left("abcd", -2), left("abcd", 0), left("abcd", 'a')
-- !query 9 schema
-- !query 11 schema
struct<left(NULL, -2):string,left('abcd', -2):string,left('abcd', 0):string,left('abcd', 'a'):string>
-- !query 9 output
-- !query 11 output
NULL NULL


-- !query 10
-- !query 12
select right("abcd", 2), right("abcd", 5), right("abcd", '2'), right("abcd", null)
-- !query 10 schema
-- !query 12 schema
struct<right('abcd', 2):string,right('abcd', 5):string,right('abcd', '2'):string,right('abcd', NULL):string>
-- !query 10 output
-- !query 12 output
cd abcd cd NULL


-- !query 11
-- !query 13
select right(null, -2), right("abcd", -2), right("abcd", 0), right("abcd", 'a')
-- !query 11 schema
-- !query 13 schema
struct<right(NULL, -2):string,right('abcd', -2):string,right('abcd', 0):string,right('abcd', 'a'):string>
-- !query 11 output
-- !query 13 output
NULL NULL


-- !query 12
-- !query 14
set spark.sql.function.concatBinaryAsString=false
-- !query 12 schema
-- !query 14 schema
struct<key:string,value:string>
-- !query 12 output
-- !query 14 output
spark.sql.function.concatBinaryAsString false


-- !query 13
-- !query 15
EXPLAIN SELECT ((col1 || col2) || (col3 || col4)) col
FROM (
SELECT
Expand All @@ -138,15 +154,15 @@ FROM (
encode(string(id + 3), 'utf-8') col4
FROM range(10)
)
-- !query 13 schema
-- !query 15 schema
struct<plan:string>
-- !query 13 output
-- !query 15 output
== Physical Plan ==
*Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), cast(encode(cast((id#xL + 2) as string), utf-8) as string), cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]
+- *Range (0, 10, step=1, splits=2)


-- !query 14
-- !query 16
EXPLAIN SELECT (col1 || (col3 || col4)) col
FROM (
SELECT
Expand All @@ -155,9 +171,9 @@ FROM (
encode(string(id + 3), 'utf-8') col4
FROM range(10)
)
-- !query 14 schema
-- !query 16 schema
struct<plan:string>
-- !query 14 output
-- !query 16 output
== Physical Plan ==
*Project [concat(cast(id#xL as string), cast(encode(cast((id#xL + 2) as string), utf-8) as string), cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]
+- *Range (0, 10, step=1, splits=2)
Original file line number Diff line number Diff line change
Expand Up @@ -329,16 +329,52 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
Row(" "))
}

test("string split function") {
val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
test("string split function with no limit") {
val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")

checkAnswer(
df.select(split($"a", "[1-9]+")),
Row(Seq("aa", "bb", "cc")))
Row(Seq("aa", "bb", "cc", "")))

checkAnswer(
df.selectExpr("split(a, '[1-9]+')"),
Row(Seq("aa", "bb", "cc", "")))
}

test("string split function with limit explicitly set to 0") {
val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")

checkAnswer(
df.select(split($"a", "[1-9]+", 0)),
Row(Seq("aa", "bb", "cc")))

checkAnswer(
df.selectExpr("split(a, '[1-9]+', 0)"),
Row(Seq("aa", "bb", "cc")))
}

test("string split function with positive limit") {
val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")

checkAnswer(
df.select(split($"a", "[1-9]+", 2)),
Row(Seq("aa", "bb3cc4")))

checkAnswer(
df.selectExpr("split(a, '[1-9]+', 2)"),
Row(Seq("aa", "bb3cc4")))
}

test("string split function with negative limit") {
val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")

checkAnswer(
df.select(split($"a", "[1-9]+", -2)),
Row(Seq("aa", "bb", "cc", "")))

checkAnswer(
df.selectExpr("split(a, '[1-9]+', -2)"),
Row(Seq("aa", "bb", "cc", "")))
}

test("string / binary length function") {
Expand Down