-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-25202] [SQL] Implements split with limit sql function #22227
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
15362be
ceb3f41
e564a68
4e10733
5135cb2
e8c8c8c
8e16328
ca23ea3
96bc875
79599eb
a27c848
fa128db
a641106
7e4ba98
d80b1a1
d17d2df
64b0afc
4e84df0
b12ee88
69d2190
b5994ad
5c8f487
34ba74f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -232,30 +232,49 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress | |
| * Splits str around pat (pattern is a regular expression). | ||
| */ | ||
| @ExpressionDescription( | ||
| usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match `regex`.", | ||
| usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences that match `regex`." + | ||
| "The `limit` parameter controls the number of times the pattern is applied. If the limit " + | ||
| "n is greater than zero then the pattern will be applied at most n - 1 times, " + | ||
| "the array's length will be no greater than n, and the array's last entry " + | ||
| "will contain all input beyond the last matched delimiter. If n is " + | ||
| "less than 0, then the pattern will be applied as many times as " + | ||
| "possible and the array can have any length. If n is zero then the " + | ||
| "pattern will be applied as many times as possible, the array can " + | ||
| "have any length, and trailing empty strings will be discarded.", | ||
|
||
| arguments = """ | ||
| Arguments: | ||
| * str - a string expression to split. | ||
| * pattern - a string representing a regular expression. The pattern string should be a | ||
| Java regular expression. | ||
| * limit - an integer expression. | ||
| """, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about this formatting?; |
||
| examples = """ | ||
| Examples: | ||
| > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]'); | ||
| ["one","two","three",""] | ||
| | > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2); | ||
|
||
| ["one","twoBthreeC"] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add the negative case? |
||
| """) | ||
| case class StringSplit(str: Expression, pattern: Expression) | ||
| extends BinaryExpression with ImplicitCastInputTypes { | ||
| case class StringSplit(str: Expression, pattern: Expression, limit: Expression) | ||
|
||
| extends TernaryExpression with ImplicitCastInputTypes { | ||
|
|
||
| override def left: Expression = str | ||
| override def right: Expression = pattern | ||
| override def dataType: DataType = ArrayType(StringType) | ||
| override def inputTypes: Seq[DataType] = Seq(StringType, StringType) | ||
| override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) | ||
| override def children: Seq[Expression] = str :: pattern :: limit :: Nil | ||
|
|
||
| def this(exp: Expression, pattern: Expression) = this(exp, pattern, Literal(-1)); | ||
|
|
||
| override def nullSafeEval(string: Any, regex: Any): Any = { | ||
| val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1) | ||
| override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we still need to do some check on
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @viirya the underlying implementation of this method is |
||
| val strings = string.asInstanceOf[UTF8String].split( | ||
| regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int]) | ||
| new GenericArrayData(strings.asInstanceOf[Array[Any]]) | ||
| } | ||
|
|
||
| override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
| val arrayClass = classOf[GenericArrayData].getName | ||
| nullSafeCodeGen(ctx, ev, (str, pattern) => | ||
| nullSafeCodeGen(ctx, ev, (str, pattern, limit) => | ||
| // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. | ||
| s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""") | ||
| s"""${ev.value} = new $arrayClass($str.split($pattern, $limit));""") | ||
| } | ||
|
|
||
| override def prettyName: String = "split" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2554,7 +2554,27 @@ object functions { | |
| * @since 1.5.0 | ||
| */ | ||
| def split(str: Column, pattern: String): Column = withExpr { | ||
| StringSplit(str.expr, lit(pattern).expr) | ||
| StringSplit(str.expr, Literal(pattern), Literal(-1)) | ||
| } | ||
|
|
||
| /** | ||
| * Splits str around pattern (pattern is a regular expression). | ||
| * | ||
| * The limit parameter controls the number of times the pattern is applied and therefore | ||
| * affects the length of the resulting array. If the limit n is greater than zero then the | ||
| * pattern will be applied at most n - 1 times, the array's length will be no greater than | ||
| * n, and the array's last entry will contain all input beyond the last matched delimiter. | ||
| * If n is non-positive then the pattern will be applied as many times as possible and the | ||
| * array can have any length. If n is zero then the pattern will be applied as many times as | ||
| * possible, the array can have any length, and trailing empty strings will be discarded. | ||
|
||
| * | ||
| * @note Pattern is a string representation of the regular expression. | ||
| * | ||
| * @group string_funcs | ||
| * @since 2.4.0 | ||
|
||
| */ | ||
| def split(str: Column, pattern: String, limit: Int): Column = withExpr { | ||
| StringSplit(str.expr, Literal(pattern), Literal(limit)) | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,10 @@ select format_string(); | |
| -- A pipe operator for string concatenation | ||
| select 'a' || 'b' || 'c'; | ||
|
|
||
| -- split function | ||
| select split('aa1cc2ee', '[1-9]+', 2); | ||
| select split('aa1cc2ee', '[1-9]+'); | ||
|
|
||
|
||
| -- Check if catalyst combine nested `Concat`s | ||
| EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col | ||
| FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10)); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you refine the description and the format along with the others, e.g.,
RLike: spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
Line 78 in ceb3f41