apache · phegstrom · Aug 24, 2018 · Aug 24, 2018 · Aug 27, 2018 · Aug 27, 2018
diff --git a/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -232,30 +232,49 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress
  * Splits str around pat (pattern is a regular expression).
  */
 @ExpressionDescription(
-  usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match `regex`.",
+  usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences that match `regex`. " +
       * pattern - a string expression. The pattern is a string which is matched literally, with 
       * pattern - a string expression. The pattern is a string which is matched literally, with 
+    "The `limit` parameter controls the number of times the pattern is applied. If the limit " +
+    "n is greater than zero then the pattern will be applied at most n - 1 times, " +
+    "the array's length will be no greater than n, and the array's last entry " +
+    "will contain all input beyond the last matched delimiter. If n is " +
+    "less than 0, then the pattern will be applied as many times as " +
+    "possible and the array can have any length. If n is zero then the " +
+    "pattern will be applied as many times as possible, the array can " +
+    "have any length, and trailing empty strings will be discarded.",
+  arguments = """
+    Arguments:
+      * str - a string expression to split.
+      * regex - a string representing a regular expression. The regex string should be a
+        Java regular expression.
+      * limit - an integer expression.
+  """,
   examples = """
     Examples:
       > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]');
        ["one","two","three",""]
+      > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2);
+       ["one","twoBthreeC"]
   """)
-case class StringSplit(str: Expression, pattern: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes {
+case class StringSplit(str: Expression, pattern: Expression, limit: Expression)
+  extends TernaryExpression with ImplicitCastInputTypes {
 
-  override def left: Expression = str
-  override def right: Expression = pattern
   override def dataType: DataType = ArrayType(StringType)
-  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
+  override def children: Seq[Expression] = str :: pattern :: limit :: Nil
+
+  def this(exp: Expression, pattern: Expression) = this(exp, pattern, Literal(-1));
 
-  override def nullSafeEval(string: Any, regex: Any): Any = {
-    val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
+  override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
+    val strings = string.asInstanceOf[UTF8String].split(
+      regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
     new GenericArrayData(strings.asInstanceOf[Array[Any]])
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val arrayClass = classOf[GenericArrayData].getName
-    nullSafeCodeGen(ctx, ev, (str, pattern) =>
+    nullSafeCodeGen(ctx, ev, (str, pattern, limit) =>
       // Array in java is covariant, so we don't need to cast UTF8String[] to Object[].
-      s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""")
+      s"""${ev.value} = new $arrayClass($str.split($pattern, $limit));""")
   }
 
   override def prettyName: String = "split"

diff --git a/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -225,11 +225,17 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val row3 = create_row("aa2bb3cc", null)
 
     checkEvaluation(
-      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
+      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), -1), Seq("aa", "bb", "cc"), row1)
     checkEvaluation(
-      StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
-    checkEvaluation(StringSplit(s1, s2), null, row2)
-    checkEvaluation(StringSplit(s1, s2), null, row3)
+      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), 2), Seq("aa", "bb3cc"), row1)
+    checkEvaluation(
+      StringSplit(Literal("aacbbcddc"), Literal("c"), 0), Seq("aa", "bb", "dd"), row1)
+    checkEvaluation(
+      StringSplit(Literal("aacbbcddc"), Literal("c"), -1), Seq("aa", "bb", "dd", ""), row1)
+    checkEvaluation(
+      StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1)
+    checkEvaluation(StringSplit(s1, s2, -1), null, row2)
+    checkEvaluation(StringSplit(s1, s2, -1), null, row3)
   }
 
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2554,7 +2554,27 @@ object functions {
    * @since 1.5.0
    */
   def split(str: Column, pattern: String): Column = withExpr {
-    StringSplit(str.expr, lit(pattern).expr)
+    StringSplit(str.expr, Literal(pattern), Literal(-1))
+  }
+
+  /**
+   * Splits str around pattern (pattern is a regular expression).
+   *
+   * The limit parameter controls the number of times the pattern is applied and therefore
+   * affects the length of the resulting array. If the limit n is greater than zero then the
+   * pattern will be applied at most n - 1 times, the array's length will be no greater than
+   * n, and the array's last entry will contain all input beyond the last matched delimiter.
+   * If n is negative then the pattern will be applied as many times as possible and the
+   * array can have any length. If n is zero then the pattern will be applied as many times as
+   * possible, the array can have any length, and trailing empty strings will be discarded.
+   *
+   * @note Pattern is a string representation of the regular expression.
+   *
+   * @group string_funcs
+   * @since 2.4.0
+   */
+  def split(str: Column, pattern: String, limit: Int): Column = withExpr {
+    StringSplit(str.expr, Literal(pattern), Literal(limit))
   }
 
   /**

diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -5,6 +5,10 @@ select format_string();
 -- A pipe operator for string concatenation
 select 'a' || 'b' || 'c';
 
+-- split function
+select split('aa1cc2ee', '[1-9]+', 2);
+select split('aa1cc2ee', '[1-9]+');
+
 -- Check if catalyst combine nested `Concat`s
 EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col
 FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10));

diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 15
+-- Number of queries: 17
 
 
 -- !query 0
@@ -29,11 +29,27 @@ abc
 
 
 -- !query 3
+select split('aa1cc2ee', '[1-9]+', 2)
+-- !query 3 schema
+struct<split(aa1cc2ee, [1-9]+, 2):array<string>>
+-- !query 3 output
+["aa","cc2ee"]
+
+
+-- !query 4
+select split('aa1cc2ee', '[1-9]+')
+-- !query 4 schema
+struct<split(aa1cc2ee, [1-9]+, -1):array<string>>
+-- !query 4 output
+["aa","cc","ee"]
+
+
+-- !query 5
 EXPLAIN EXTENDED SELECT (col1 || col2 || col3 || col4) col
 FROM (SELECT id col1, id col2, id col3, id col4 FROM range(10))
--- !query 3 schema
+-- !query 5 schema
 struct<plan:string>
--- !query 3 output
+-- !query 5 output
 == Parsed Logical Plan ==
 'Project [concat(concat(concat('col1, 'col2), 'col3), 'col4) AS col#x]
 +- 'SubqueryAlias `__auto_generated_subquery_name`
@@ -56,79 +72,79 @@ Project [concat(cast(id#xL as string), cast(id#xL as string), cast(id#xL as stri
 +- *Range (0, 10, step=1, splits=2)
 
 
--- !query 4
+-- !query 6
 select replace('abc', 'b', '123')
--- !query 4 schema
+-- !query 6 schema
 struct<replace(abc, b, 123):string>
--- !query 4 output
+-- !query 6 output
 a123c
 
 
--- !query 5
+-- !query 7
 select replace('abc', 'b')
--- !query 5 schema
+-- !query 7 schema
 struct<replace(abc, b, ):string>
--- !query 5 output
+-- !query 7 output
 ac
 
 
--- !query 6
+-- !query 8
 select length(uuid()), (uuid() <> uuid())
--- !query 6 schema
+-- !query 8 schema
 struct<length(uuid()):int,(NOT (uuid() = uuid())):boolean>
--- !query 6 output
+-- !query 8 output
 36	true
 
 
--- !query 7
+-- !query 9
 select position('bar' in 'foobarbar'), position(null, 'foobarbar'), position('aaads', null)
--- !query 7 schema
+-- !query 9 schema
 struct<locate(bar, foobarbar, 1):int,locate(CAST(NULL AS STRING), foobarbar, 1):int,locate(aaads, CAST(NULL AS STRING), 1):int>
--- !query 7 output
+-- !query 9 output
 4	NULL	NULL
 
 
--- !query 8
+-- !query 10
 select left("abcd", 2), left("abcd", 5), left("abcd", '2'), left("abcd", null)
--- !query 8 schema
+-- !query 10 schema
 struct<left('abcd', 2):string,left('abcd', 5):string,left('abcd', '2'):string,left('abcd', NULL):string>
--- !query 8 output
+-- !query 10 output
 ab	abcd	ab	NULL
 
 
--- !query 9
+-- !query 11
 select left(null, -2), left("abcd", -2), left("abcd", 0), left("abcd", 'a')
--- !query 9 schema
+-- !query 11 schema
 struct<left(NULL, -2):string,left('abcd', -2):string,left('abcd', 0):string,left('abcd', 'a'):string>
--- !query 9 output
+-- !query 11 output
 NULL			NULL
 
 
--- !query 10
+-- !query 12
 select right("abcd", 2), right("abcd", 5), right("abcd", '2'), right("abcd", null)
--- !query 10 schema
+-- !query 12 schema
 struct<right('abcd', 2):string,right('abcd', 5):string,right('abcd', '2'):string,right('abcd', NULL):string>
--- !query 10 output
+-- !query 12 output
 cd	abcd	cd	NULL
 
 
--- !query 11
+-- !query 13
 select right(null, -2), right("abcd", -2), right("abcd", 0), right("abcd", 'a')
--- !query 11 schema
+-- !query 13 schema
 struct<right(NULL, -2):string,right('abcd', -2):string,right('abcd', 0):string,right('abcd', 'a'):string>
--- !query 11 output
+-- !query 13 output
 NULL			NULL
 
 
--- !query 12
+-- !query 14
 set spark.sql.function.concatBinaryAsString=false
--- !query 12 schema
+-- !query 14 schema
 struct<key:string,value:string>
--- !query 12 output
+-- !query 14 output
 spark.sql.function.concatBinaryAsString	false
 
 
--- !query 13
+-- !query 15
 EXPLAIN SELECT ((col1 || col2) || (col3 || col4)) col
 FROM (
   SELECT
@@ -138,15 +154,15 @@ FROM (
     encode(string(id + 3), 'utf-8') col4
   FROM range(10)
 )
--- !query 13 schema
+-- !query 15 schema
 struct<plan:string>
--- !query 13 output
+-- !query 15 output
 == Physical Plan ==
 *Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), cast(encode(cast((id#xL + 2) as string), utf-8) as string), cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]
 +- *Range (0, 10, step=1, splits=2)
 
 
--- !query 14
+-- !query 16
 EXPLAIN SELECT (col1 || (col3 || col4)) col
 FROM (
   SELECT
@@ -155,9 +171,9 @@ FROM (
     encode(string(id + 3), 'utf-8') col4
   FROM range(10)
 )
--- !query 14 schema
+-- !query 16 schema
 struct<plan:string>
--- !query 14 output
+-- !query 16 output
 == Physical Plan ==
 *Project [concat(cast(id#xL as string), cast(encode(cast((id#xL + 2) as string), utf-8) as string), cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]
 +- *Range (0, 10, step=1, splits=2)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -329,16 +329,52 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
       Row("   "))
   }
 
-  test("string split function") {
-    val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
+  test("string split function with no limit") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
 
     checkAnswer(
       df.select(split($"a", "[1-9]+")),
-      Row(Seq("aa", "bb", "cc")))
+      Row(Seq("aa", "bb", "cc", "")))
 
     checkAnswer(
       df.selectExpr("split(a, '[1-9]+')"),
+      Row(Seq("aa", "bb", "cc", "")))
+  }
+
+  test("string split function with limit explicitly set to 0") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(split($"a", "[1-9]+", 0)),
       Row(Seq("aa", "bb", "cc")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+', 0)"),
+      Row(Seq("aa", "bb", "cc")))
+  }
+
+  test("string split function with positive limit") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(split($"a", "[1-9]+", 2)),
+      Row(Seq("aa", "bb3cc4")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+', 2)"),
+      Row(Seq("aa", "bb3cc4")))
+  }
+
+  test("string split function with negative limit") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(split($"a", "[1-9]+", -2)),
+      Row(Seq("aa", "bb", "cc", "")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+', -2)"),
+      Row(Seq("aa", "bb", "cc", "")))
   }
 
   test("string / binary length function") {