apache · phegstrom · Aug 24, 2018 · Aug 24, 2018 · Aug 27, 2018 · Aug 27, 2018
diff --git a/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -229,33 +229,59 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress
 
 
 /**
- * Splits str around pat (pattern is a regular expression).
+ * Splits str around pattern (pattern is a regular expression).
  */
 @ExpressionDescription(
-  usage = "_FUNC_(str, regex) - Splits `str` around occurrences that match `regex`.",
+  usage = "_FUNC_(str, regex, limit) - Splits `str` around occurrences that match `regex`" +
+    " and returns an array whose length is at most `limit` when `limit` is positive.",
+  arguments = """
+    Arguments:
+      * str - a string expression to split.
+      * regex - a string representing a regular expression. The regex string should be a
+        Java regular expression.
+      * limit - an integer expression which controls the number of times the regex is applied.
+
+        limit > 0:
+          The resulting array's length will not be more than `limit`, and the resulting array's
+          last entry will contain all input beyond the last matched regex.
+
+        limit < 0:
+          `regex` will be applied as many times as possible, and the resulting
+          array can be of any size.
+
+        limit = 0:
+          `regex` will be applied as many times as possible, the resulting array can
+          be of any size, and trailing empty strings will be discarded.
+  """,
   examples = """
     Examples:
       > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]');
        ["one","two","three",""]
+      > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 0);
+       ["one","two","three"]
+      > SELECT _FUNC_('oneAtwoBthreeC', '[ABC]', 2);
+       ["one","twoBthreeC"]
   """)
-case class StringSplit(str: Expression, pattern: Expression)
-  extends BinaryExpression with ImplicitCastInputTypes {
+case class StringSplit(str: Expression, pattern: Expression, limit: Expression)
+  extends TernaryExpression with ImplicitCastInputTypes {
 
-  override def left: Expression = str
-  override def right: Expression = pattern
   override def dataType: DataType = ArrayType(StringType)
-  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+  override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
+  override def children: Seq[Expression] = str :: pattern :: limit :: Nil
+
+  def this(exp: Expression, pattern: Expression) = this(exp, pattern, Literal(-1));
 
-  override def nullSafeEval(string: Any, regex: Any): Any = {
-    val strings = string.asInstanceOf[UTF8String].split(regex.asInstanceOf[UTF8String], -1)
+  override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
+    val strings = string.asInstanceOf[UTF8String].split(
+      regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int])
     new GenericArrayData(strings.asInstanceOf[Array[Any]])
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val arrayClass = classOf[GenericArrayData].getName
-    nullSafeCodeGen(ctx, ev, (str, pattern) =>
+    nullSafeCodeGen(ctx, ev, (str, pattern, limit) =>
       // Array in java is covariant, so we don't need to cast UTF8String[] to Object[].
-      s"""${ev.value} = new $arrayClass($str.split($pattern, -1));""")
+      s"""${ev.value} = new $arrayClass($str.split($pattern, $limit));""")
   }
 
   override def prettyName: String = "split"

diff --git a/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -225,11 +225,17 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     val row3 = create_row("aa2bb3cc", null)
 
     checkEvaluation(
-      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+")), Seq("aa", "bb", "cc"), row1)
+      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), -1), Seq("aa", "bb", "cc"), row1)
     checkEvaluation(
-      StringSplit(s1, s2), Seq("aa", "bb", "cc"), row1)
-    checkEvaluation(StringSplit(s1, s2), null, row2)
-    checkEvaluation(StringSplit(s1, s2), null, row3)
+      StringSplit(Literal("aa2bb3cc"), Literal("[1-9]+"), 2), Seq("aa", "bb3cc"), row1)
+    checkEvaluation(
+      StringSplit(Literal("aacbbcddc"), Literal("c"), 0), Seq("aa", "bb", "dd"), row1)
+    checkEvaluation(
+      StringSplit(Literal("aacbbcddc"), Literal("c"), -1), Seq("aa", "bb", "dd", ""), row1)
+    checkEvaluation(
+      StringSplit(s1, s2, -1), Seq("aa", "bb", "cc"), row1)
+    checkEvaluation(StringSplit(s1, s2, -1), null, row2)
+    checkEvaluation(StringSplit(s1, s2, -1), null, row3)
   }
 
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2554,7 +2554,27 @@ object functions {
    * @since 1.5.0
    */
   def split(str: Column, pattern: String): Column = withExpr {
-    StringSplit(str.expr, lit(pattern).expr)
+    StringSplit(str.expr, Literal(pattern), Literal(-1))
+  }
+
+  /**
+   * Splits str around pattern (pattern is a regular expression).
+   *
+   * The limit parameter controls the number of times the pattern is applied and therefore
+   * affects the length of the resulting array. If `limit` is greater than zero then the
+   * pattern is applied at most `limit - 1` times, the array's length is no greater than
+   * `limit`, and the array's last entry contains all input beyond the last matched delimiter.
+   * If `limit` is negative then the pattern is applied as many times as possible and the
+   * array can have any length. If `limit` is zero then the pattern is applied as many times
+   * as possible, the array can have any length, and trailing empty strings are discarded.
+   *
+   * @note Pattern is a string representation of the regular expression.
+   *
+   * @group string_funcs
+   * @since 2.4.0
+   */
+  def split(str: Column, pattern: String, limit: Int): Column = withExpr {
+    StringSplit(str.expr, Literal(pattern), Literal(limit))
   }
 
   /**

diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -47,3 +47,7 @@ FROM (
     encode(string(id + 3), 'utf-8') col4
   FROM range(10)
 )
+
+-- split function
+select split('aa1cc2ee', '[1-9]+', 2);
+select split('aa1cc2ee', '[1-9]+');
diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 15
+-- Number of queries: 16
 
 
 -- !query 0
@@ -155,9 +155,32 @@ FROM (
     encode(string(id + 3), 'utf-8') col4
   FROM range(10)
 )
+
+select split('aa1cc2ee', '[1-9]+', 2)
 -- !query 14 schema
-struct<plan:string>
+struct<>
 -- !query 14 output
-== Physical Plan ==
-*Project [concat(cast(id#xL as string), cast(encode(cast((id#xL + 2) as string), utf-8) as string), cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]
-+- *Range (0, 10, step=1, splits=2)
+org.apache.spark.sql.catalyst.parser.ParseException
+
+mismatched input 'select' expecting <EOF>(line 10, pos 0)
+
+== SQL ==
+EXPLAIN SELECT (col1 || (col3 || col4)) col
+FROM (
+  SELECT
+    string(id) col1,
+    encode(string(id + 2), 'utf-8') col3,
+    encode(string(id + 3), 'utf-8') col4
+  FROM range(10)
+)
+
+select split('aa1cc2ee', '[1-9]+', 2)
+^^^
+
+
+-- !query 15
+select split('aa1cc2ee', '[1-9]+')
+-- !query 15 schema
+struct<split(aa1cc2ee, [1-9]+, -1):array<string>>
+-- !query 15 output
+["aa","cc","ee"]
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -329,16 +329,52 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
       Row("   "))
   }
 
-  test("string split function") {
-    val df = Seq(("aa2bb3cc", "[1-9]+")).toDF("a", "b")
+  test("string split function with no limit") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
 
     checkAnswer(
       df.select(split($"a", "[1-9]+")),
-      Row(Seq("aa", "bb", "cc")))
+      Row(Seq("aa", "bb", "cc", "")))
 
     checkAnswer(
       df.selectExpr("split(a, '[1-9]+')"),
+      Row(Seq("aa", "bb", "cc", "")))
+  }
+
+  test("string split function with limit explicitly set to 0") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(split($"a", "[1-9]+", 0)),
       Row(Seq("aa", "bb", "cc")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+', 0)"),
+      Row(Seq("aa", "bb", "cc")))
+  }
+
+  test("string split function with positive limit") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(split($"a", "[1-9]+", 2)),
+      Row(Seq("aa", "bb3cc4")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+', 2)"),
+      Row(Seq("aa", "bb3cc4")))
+  }
+
+  test("string split function with negative limit") {
+    val df = Seq(("aa2bb3cc4", "[1-9]+")).toDF("a", "b")
+
+    checkAnswer(
+      df.select(split($"a", "[1-9]+", -2)),
+      Row(Seq("aa", "bb", "cc", "")))
+
+    checkAnswer(
+      df.selectExpr("split(a, '[1-9]+', -2)"),
+      Row(Seq("aa", "bb", "cc", "")))
   }
 
   test("string / binary length function") {