From 1df379ba2ca8b0339aea6cd5ee0edbe551b23e28 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Fri, 10 Sep 2021 18:23:41 +0900 Subject: [PATCH 01/22] Implement scala/python octet_length --- python/pyspark/sql/functions.py | 20 +++++++++++++++++ python/pyspark/sql/functions.pyi | 1 + python/pyspark/sql/tests/test_functions.py | 6 +++++ .../org/apache/spark/sql/functions.scala | 8 +++++++ .../spark/sql/StringFunctionsSuite.scala | 22 +++++++++++++++++++ 5 files changed, 57 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e418c0d11f8f..6ee3e4e253db 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3098,6 +3098,26 @@ def length(col): return Column(sc._jvm.functions.length(_to_java_column(col))) +def octet_length(col): + """ + .. versionadded:: 3.3.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + Source column or strings + Returns + ------- + :class:`~pyspark.sql.Column` + Byte length of the col + Examples + ------- + >>> from pyspark.sql.functions import octet_length + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect() + [Row(octet_length(cat)=3), Row(octet_length(cat)=4)] + """ + return _invoke_function_over_column("octet_length", col) + + def translate(srcCol, matching, replace): """A function translate any character in the `srcCol` by a character in `matching`. The characters in `replace` is corresponding to the characters in `matching`. diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index 143fa133f4fe..f29d252c146a 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -174,6 +174,7 @@ def bin(col: ColumnOrName) -> Column: ... def hex(col: ColumnOrName) -> Column: ... def unhex(col: ColumnOrName) -> Column: ... def length(col: ColumnOrName) -> Column: ... +def octet_length(col: ColumnOrName) -> Column: ... 
def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ... def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ... def create_map(*cols: ColumnOrName) -> Column: ... diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 082d61b73242..759553f6a4ac 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -197,6 +197,12 @@ def test_string_functions(self): df.select(getattr(functions, name)("name")).first()[0], df.select(getattr(functions, name)(col("name"))).first()[0]) + def test_octet_length_function(self): + from pyspark.sql.functions import octet_length + df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) + actual = df.select(octet_length('cat')).collect() + self.assertEqual([Row(3), Row(4)], actual) + def test_array_contains_function(self): from pyspark.sql.functions import array_contains diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 781a2dd5649e..ef921d28ac06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2706,6 +2706,14 @@ object functions { StringTrimLeft(e.expr, Literal(trimString)) } + /** + * Calculates the byte length for the specified string column. + * + * @group string_funcs + * @since 3.3.0 + */ + def octet_length(e: Column): Column = withExpr { OctetLength(e.expr) } + /** * Extract a specific group matched by a Java regex, from the specified string column. * If the regex did not match, or the specified group did not match, an empty string is returned. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 00074b01ba6d..212e283748fa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -486,6 +486,28 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } + test("string / binary octet-length function") { + val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) + .toDF("a", "b", "c", "d", "e", "f") + checkAnswer( + df.select(octet_length($"a"), octet_length($"b")), + Row(3, 4)) + + checkAnswer( + df.selectExpr("octet_length(a)", "octet_length(b)"), + Row(3, 4)) + + checkAnswer( + df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"), + Row(3, 3, 5) + ) + + checkAnswer( + df.selectExpr("length(f)", "octet_length(f)"), + Row(1, 4) + ) + } + test("initcap function") { val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z") checkAnswer( From 6df46688e67897bfc4487b45b75c1fba413cc396 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Fri, 10 Sep 2021 20:41:54 +0900 Subject: [PATCH 02/22] Implement R octet_length --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 13 +++++++++++++ R/pkg/R/generics.R | 4 ++++ R/pkg/tests/fulltests/test_sparkSQL.R | 7 +++++++ 4 files changed, 25 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 7fa80853fd23..28323f8cd8ca 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -364,6 +364,7 @@ exportMethods("%<=>%", "not", "nth_value", "ntile", + "octet_length", "otherwise", "over", "overlay", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 62066da10d0d..1781b00b569e 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3621,6 +3621,19 @@ setMethod("ntile", column(jc) }) +#' @details +#' \code{octet_length}: Calculates the byte length for the specified string column. 
+#' +#' @rdname column_string_functions +#' @aliases octet_length octet_length,Column-method +#' @note length since 3.3.0 +setMethod("octet_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc) + column(jc) + }) + #' @details #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window #' partition. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 9ebea3f55e69..c52b14518319 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1230,6 +1230,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") }) #' @name NULL setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) +#' @rdname column_string_functions +#' @name NULL +setGeneric("octet_length", function(x, ...) { standardGeneric("octet_length") }) + #' @rdname column_string_functions #' @name NULL setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index b97c50074993..0086b49f0e44 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1988,6 +1988,13 @@ test_that("string operators", { collect(select(df5, repeat_string(df5$a, -1)))[1, 1], "" ) + + l6 <- list(list("cat"), list("\ud83d\udc08")) + df6 <- createDataFrame(l6) + expect_equal( + collect(select(df6, octet_length(df6$"_1")))[,1], + c(3,4) + ) }) test_that("date functions on a DataFrame", { From eb2456b26e2ea08b8375117617e3e75ea78fbeb7 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 13:00:01 +0900 Subject: [PATCH 03/22] Implement bit-length functions --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 13 +++++++ R/pkg/R/generics.R | 4 +++ R/pkg/tests/fulltests/test_sparkSQL.R | 4 +++ python/pyspark/sql/functions.py | 20 +++++++++++ python/pyspark/sql/functions.pyi | 1 + python/pyspark/sql/tests/test_functions.py | 6 ++++ 
.../org/apache/spark/sql/functions.scala | 8 +++++ .../spark/sql/StringFunctionsSuite.scala | 35 ++++++++++++++++--- 9 files changed, 87 insertions(+), 5 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 28323f8cd8ca..8b8946cf51ca 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -245,6 +245,7 @@ exportMethods("%<=>%", "bin", "bitwise_not", "bitwiseNOT", + "bit_length", "bround", "cast", "cbrt", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 1781b00b569e..35e83d017be9 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -676,6 +676,19 @@ setMethod("bitwiseNOT", bitwise_not(x) }) +#' @details +#' \code{bit_length}: Calculates the bit length for the specified string column. +#' +#' @rdname column_string_functions +#' @aliases bit_length bit_length,Column-method +#' @note length since 3.3.0 +setMethod("bit_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc) + column(jc) + }) + #' @details #' \code{cbrt}: Computes the cube-root of the given value. #' diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c52b14518319..f0767c23bd6e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -892,6 +892,10 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") }) #' @name NULL setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) +#' @rdname column_string_functions +#' @name NULL +setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") }) + #' @rdname column_math_functions #' @name NULL setGeneric("bround", function(x, ...) 
{ standardGeneric("bround") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 0086b49f0e44..c95a3c056aaf 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1995,6 +1995,10 @@ test_that("string operators", { collect(select(df6, octet_length(df6$"_1")))[,1], c(3,4) ) + expect_equal( + collect(select(df6, bit_length(df6$"_1")))[,1], + c(24,32) + ) }) test_that("date functions on a DataFrame", { diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 6ee3e4e253db..d6bf872551be 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3118,6 +3118,26 @@ def octet_length(col): return _invoke_function_over_column("octet_length", col) +def bit_length(col): + """ + .. versionadded:: 3.3.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + Source column or strings + Returns + ------- + :class:`~pyspark.sql.Column` + Bit length of the col + Examples + ------- + >>> from pyspark.sql.functions import bit_length + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect() + [Row(bit_length(cat)=24), Row(bit_length(cat)=32)] + """ + return _invoke_function_over_column("bit_length", col) + + def translate(srcCol, matching, replace): """A function translate any character in the `srcCol` by a character in `matching`. The characters in `replace` is corresponding to the characters in `matching`. diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index f29d252c146a..1a0a61efb84f 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -175,6 +175,7 @@ def hex(col: ColumnOrName) -> Column: ... def unhex(col: ColumnOrName) -> Column: ... def length(col: ColumnOrName) -> Column: ... def octet_length(col: ColumnOrName) -> Column: ... +def bit_length(col: ColumnOrName) -> Column: ... 
def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ... def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ... def create_map(*cols: ColumnOrName) -> Column: ... diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 759553f6a4ac..299d14962c88 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -203,6 +203,12 @@ def test_octet_length_function(self): actual = df.select(octet_length('cat')).collect() self.assertEqual([Row(3), Row(4)], actual) + def test_bit_length_function(self): + from pyspark.sql.functions import bit_length + df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) + actual = df.select(bit_length('cat')).collect() + self.assertEqual([Row(24), Row(32)], actual) + def test_array_contains_function(self): from pyspark.sql.functions import array_contains diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index ef921d28ac06..2d12d5f0aacc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2541,6 +2541,14 @@ object functions { */ def base64(e: Column): Column = withExpr { Base64(e.expr) } + /** + * Calculates the bit length for the specified string column. + * + * @group string_funcs + * @since 3.3.0 + */ + def bit_length(e: Column): Column = withExpr { BitLength(e.expr) } + /** * Concatenates multiple input string columns together into a single string column, * using the given separator. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 212e283748fa..353d37dda6c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -486,28 +486,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } - test("string / binary octet-length function") { + test("octet-length function") { val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) .toDF("a", "b", "c", "d", "e", "f") + // string and binary input checkAnswer( df.select(octet_length($"a"), octet_length($"b")), Row(3, 4)) - + // string and binary input checkAnswer( df.selectExpr("octet_length(a)", "octet_length(b)"), Row(3, 4)) - + // integer, float and double input checkAnswer( df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"), Row(3, 3, 5) ) + // multi-byte character input + checkAnswer( + df.selectExpr("octet_length(f)"), + Row(4) + ) + } + test("bit-length function") { + val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) + .toDF("a", "b", "c", "d", "e", "f") + // string and binary input + checkAnswer( + df.select(bit_length($"a"), bit_length($"b")), + Row(24, 32)) + // string and binary input + checkAnswer( + df.selectExpr("bit_length(a)", "bit_length(b)"), + Row(24, 32)) + // integer, float and double input checkAnswer( - df.selectExpr("length(f)", "octet_length(f)"), - Row(1, 4) + df.selectExpr("bit_length(c)", "bit_length(d)", "bit_length(e)"), + Row(24, 24, 40) + ) + // multi-byte character input + checkAnswer( + df.selectExpr("bit_length(f)"), + Row(32) ) } + test("initcap function") { val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z") checkAnswer( From 3e149faeb6bb5dce20f41e1feb821367541dff08 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 13:40:51 +0900 
Subject: [PATCH 04/22] Rearrange the order of octet_length --- R/pkg/NAMESPACE | 2 +- R/pkg/R/functions.R | 52 ++++++++++++++++++++++----------------------- R/pkg/R/generics.R | 8 +++---- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 8b8946cf51ca..686a49e44dfa 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -243,9 +243,9 @@ exportMethods("%<=>%", "base64", "between", "bin", + "bit_length", "bitwise_not", "bitwiseNOT", - "bit_length", "bround", "cast", "cbrt", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 35e83d017be9..22bf27c21dbb 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -646,6 +646,19 @@ setMethod("bin", column(jc) }) +#' @details +#' \code{bit_length}: Calculates the bit length for the specified string column. +#' +#' @rdname column_string_functions +#' @aliases bit_length bit_length,Column-method +#' @note length since 3.3.0 +setMethod("bit_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc) + column(jc) + }) + #' @details #' \code{bitwise_not}: Computes bitwise NOT. #' @@ -676,19 +689,6 @@ setMethod("bitwiseNOT", bitwise_not(x) }) -#' @details -#' \code{bit_length}: Calculates the bit length for the specified string column. -#' -#' @rdname column_string_functions -#' @aliases bit_length bit_length,Column-method -#' @note length since 3.3.0 -setMethod("bit_length", - signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc) - column(jc) - }) - #' @details #' \code{cbrt}: Computes the cube-root of the given value. #' @@ -1582,6 +1582,19 @@ setMethod("negate", column(jc) }) +#' @details +#' \code{octet_length}: Calculates the byte length for the specified string column. 
+#' +#' @rdname column_string_functions +#' @aliases octet_length octet_length,Column-method +#' @note length since 3.3.0 +setMethod("octet_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc) + column(jc) + }) + #' @details #' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace}, #' starting from byte position \code{pos} of \code{src} and proceeding for @@ -3634,19 +3647,6 @@ setMethod("ntile", column(jc) }) -#' @details -#' \code{octet_length}: Calculates the byte length for the specified string column. -#' -#' @rdname column_string_functions -#' @aliases octet_length octet_length,Column-method -#' @note length since 3.3.0 -setMethod("octet_length", - signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc) - column(jc) - }) - #' @details #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window #' partition. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index f0767c23bd6e..1abde65391e9 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -884,6 +884,10 @@ setGeneric("base64", function(x) { standardGeneric("base64") }) #' @name NULL setGeneric("bin", function(x) { standardGeneric("bin") }) +#' @rdname column_string_functions +#' @name NULL +setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") }) + #' @rdname column_nonaggregate_functions #' @name NULL setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") }) @@ -892,10 +896,6 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") }) #' @name NULL setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) -#' @rdname column_string_functions -#' @name NULL -setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") }) - #' @rdname column_math_functions #' @name NULL setGeneric("bround", function(x, ...) 
{ standardGeneric("bround") }) From ff2f0313dcfc6fbe339d4bb3c67854917ef4a062 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 16:37:55 +0900 Subject: [PATCH 05/22] Disable scalastyle to use non-ascii characters in tests --- .../scala/org/apache/spark/sql/StringFunctionsSuite.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 353d37dda6c1..da2f4fcac71f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -487,6 +487,8 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { } test("octet-length function") { + // scalastyle:off + // non ascii characters are not allowed in the code, so we disable the scalastyle here. 
val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) .toDF("a", "b", "c", "d", "e", "f") // string and binary input @@ -530,6 +535,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { df.selectExpr("bit_length(f)"), Row(32) ) + // scalastyle:on } From 5b9c98b6bada969d1fb844dcb9a2782448ce96fe Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 17:30:55 +0900 Subject: [PATCH 06/22] Insert newline to python docstrings to keep pylint rules --- python/pyspark/sql/functions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index d6bf872551be..19709792130a 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3112,7 +3112,8 @@ def octet_length(col): Examples ------- >>> from pyspark.sql.functions import octet_length - >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect() + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \ + .select(octet_length('cat')).collect() [Row(octet_length(cat)=3), Row(octet_length(cat)=4)] """ return _invoke_function_over_column("octet_length", col) @@ -3132,7 +3133,8 @@ def bit_length(col): Examples ------- >>> from pyspark.sql.functions import bit_length - >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect() + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \ + .select(bit_length('cat')).collect() [Row(bit_length(cat)=24), Row(bit_length(cat)=32)] """ return _invoke_function_over_column("bit_length", col) From 848f2249d58adb11a44e82772db71181fb82bef8 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Tue, 14 Sep 2021 12:18:35 +0900 Subject: [PATCH 07/22] add space after comma to pass lint-R --- R/pkg/tests/fulltests/test_sparkSQL.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index c95a3c056aaf..f0cb2745e1d6 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1992,12 +1992,12 @@ test_that("string operators", { l6 <- list(list("cat"), list("\ud83d\udc08")) df6 <- createDataFrame(l6) expect_equal( - collect(select(df6, octet_length(df6$"_1")))[,1], - c(3,4) + collect(select(df6, octet_length(df6$"_1")))[, 1], + c(3, 4) ) expect_equal( - collect(select(df6, bit_length(df6$"_1")))[,1], - c(24,32) + collect(select(df6, bit_length(df6$"_1")))[, 1], + c(24, 32) ) }) From 4edc918860ef3e1784de026b8bb0c6951ea428a0 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Fri, 10 Sep 2021 18:23:41 +0900 Subject: [PATCH 08/22] Implement scala/python octet_length --- python/pyspark/sql/functions.py | 20 +++++++++++++++++ python/pyspark/sql/functions.pyi | 1 + python/pyspark/sql/tests/test_functions.py | 6 +++++ .../org/apache/spark/sql/functions.scala | 8 +++++++ .../spark/sql/StringFunctionsSuite.scala | 22 +++++++++++++++++++ 5 files changed, 57 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e418c0d11f8f..6ee3e4e253db 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3098,6 +3098,26 @@ def length(col): return Column(sc._jvm.functions.length(_to_java_column(col))) +def octet_length(col): + """ + .. 
versionadded:: 3.3.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + Source column or strings + Returns + ------- + :class:`~pyspark.sql.Column` + Byte length of the col + Examples + ------- + >>> from pyspark.sql.functions import octet_length + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect() + [Row(octet_length(cat)=3), Row(octet_length(cat)=4)] + """ + return _invoke_function_over_column("octet_length", col) + + def translate(srcCol, matching, replace): """A function translate any character in the `srcCol` by a character in `matching`. The characters in `replace` is corresponding to the characters in `matching`. diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index 143fa133f4fe..f29d252c146a 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -174,6 +174,7 @@ def bin(col: ColumnOrName) -> Column: ... def hex(col: ColumnOrName) -> Column: ... def unhex(col: ColumnOrName) -> Column: ... def length(col: ColumnOrName) -> Column: ... +def octet_length(col: ColumnOrName) -> Column: ... def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ... def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ... def create_map(*cols: ColumnOrName) -> Column: ... 
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 082d61b73242..759553f6a4ac 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -197,6 +197,12 @@ def test_string_functions(self): df.select(getattr(functions, name)("name")).first()[0], df.select(getattr(functions, name)(col("name"))).first()[0]) + def test_octet_length_function(self): + from pyspark.sql.functions import octet_length + df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) + actual = df.select(octet_length('cat')).collect() + self.assertEqual([Row(3), Row(4)], actual) + def test_array_contains_function(self): from pyspark.sql.functions import array_contains diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 781a2dd5649e..ef921d28ac06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2706,6 +2706,14 @@ object functions { StringTrimLeft(e.expr, Literal(trimString)) } + /** + * Calculates the byte length for the specified string column. + * + * @group string_funcs + * @since 3.3.0 + */ + def octet_length(e: Column): Column = withExpr { OctetLength(e.expr) } + /** * Extract a specific group matched by a Java regex, from the specified string column. * If the regex did not match, or the specified group did not match, an empty string is returned. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 00074b01ba6d..212e283748fa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -486,6 +486,28 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } + test("string / binary octet-length function") { + val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) + .toDF("a", "b", "c", "d", "e", "f") + checkAnswer( + df.select(octet_length($"a"), octet_length($"b")), + Row(3, 4)) + + checkAnswer( + df.selectExpr("octet_length(a)", "octet_length(b)"), + Row(3, 4)) + + checkAnswer( + df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"), + Row(3, 3, 5) + ) + + checkAnswer( + df.selectExpr("length(f)", "octet_length(f)"), + Row(1, 4) + ) + } + test("initcap function") { val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z") checkAnswer( From 5370a5134cfc8508684920e93ca0ee334a7c113b Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Fri, 10 Sep 2021 20:41:54 +0900 Subject: [PATCH 09/22] Implement R octet_length --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 13 +++++++++++++ R/pkg/R/generics.R | 4 ++++ R/pkg/tests/fulltests/test_sparkSQL.R | 7 +++++++ 4 files changed, 25 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 7fa80853fd23..28323f8cd8ca 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -364,6 +364,7 @@ exportMethods("%<=>%", "not", "nth_value", "ntile", + "octet_length", "otherwise", "over", "overlay", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 62066da10d0d..1781b00b569e 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3621,6 +3621,19 @@ setMethod("ntile", column(jc) }) +#' @details +#' \code{octet_length}: Calculates the byte length for the specified string column. 
+#' +#' @rdname column_string_functions +#' @aliases octet_length octet_length,Column-method +#' @note length since 3.3.0 +setMethod("octet_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc) + column(jc) + }) + #' @details #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window #' partition. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 9ebea3f55e69..c52b14518319 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1230,6 +1230,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") }) #' @name NULL setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) +#' @rdname column_string_functions +#' @name NULL +setGeneric("octet_length", function(x, ...) { standardGeneric("octet_length") }) + #' @rdname column_string_functions #' @name NULL setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index b97c50074993..0086b49f0e44 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1988,6 +1988,13 @@ test_that("string operators", { collect(select(df5, repeat_string(df5$a, -1)))[1, 1], "" ) + + l6 <- list(list("cat"), list("\ud83d\udc08")) + df6 <- createDataFrame(l6) + expect_equal( + collect(select(df6, octet_length(df6$"_1")))[,1], + c(3,4) + ) }) test_that("date functions on a DataFrame", { From 72c939859c538c80b421833e9c1b567c35390df8 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 13:00:01 +0900 Subject: [PATCH 10/22] Implement bit-length functions --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 13 +++++++ R/pkg/R/generics.R | 4 +++ R/pkg/tests/fulltests/test_sparkSQL.R | 4 +++ python/pyspark/sql/functions.py | 20 +++++++++++ python/pyspark/sql/functions.pyi | 1 + python/pyspark/sql/tests/test_functions.py | 6 ++++ 
.../org/apache/spark/sql/functions.scala | 8 +++++ .../spark/sql/StringFunctionsSuite.scala | 35 ++++++++++++++++--- 9 files changed, 87 insertions(+), 5 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 28323f8cd8ca..8b8946cf51ca 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -245,6 +245,7 @@ exportMethods("%<=>%", "bin", "bitwise_not", "bitwiseNOT", + "bit_length", "bround", "cast", "cbrt", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 1781b00b569e..35e83d017be9 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -676,6 +676,19 @@ setMethod("bitwiseNOT", bitwise_not(x) }) +#' @details +#' \code{bit_length}: Calculates the bit length for the specified string column. +#' +#' @rdname column_string_functions +#' @aliases bit_length bit_length,Column-method +#' @note length since 3.3.0 +setMethod("bit_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc) + column(jc) + }) + #' @details #' \code{cbrt}: Computes the cube-root of the given value. #' diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c52b14518319..f0767c23bd6e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -892,6 +892,10 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") }) #' @name NULL setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) +#' @rdname column_string_functions +#' @name NULL +setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") }) + #' @rdname column_math_functions #' @name NULL setGeneric("bround", function(x, ...) 
{ standardGeneric("bround") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 0086b49f0e44..c95a3c056aaf 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1995,6 +1995,10 @@ test_that("string operators", { collect(select(df6, octet_length(df6$"_1")))[,1], c(3,4) ) + expect_equal( + collect(select(df6, bit_length(df6$"_1")))[,1], + c(24,32) + ) }) test_that("date functions on a DataFrame", { diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 6ee3e4e253db..d6bf872551be 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3118,6 +3118,26 @@ def octet_length(col): return _invoke_function_over_column("octet_length", col) +def bit_length(col): + """ + .. versionadded:: 3.3.0 + Parameters + ---------- + col : :class:`~pyspark.sql.Column` or str + Source column or strings + Returns + ------- + :class:`~pyspark.sql.Column` + Bit length of the col + Examples + ------- + >>> from pyspark.sql.functions import bit_length + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect() + [Row(bit_length(cat)=24), Row(bit_length(cat)=32)] + """ + return _invoke_function_over_column("bit_length", col) + + def translate(srcCol, matching, replace): """A function translate any character in the `srcCol` by a character in `matching`. The characters in `replace` is corresponding to the characters in `matching`. diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi index f29d252c146a..1a0a61efb84f 100644 --- a/python/pyspark/sql/functions.pyi +++ b/python/pyspark/sql/functions.pyi @@ -175,6 +175,7 @@ def hex(col: ColumnOrName) -> Column: ... def unhex(col: ColumnOrName) -> Column: ... def length(col: ColumnOrName) -> Column: ... def octet_length(col: ColumnOrName) -> Column: ... +def bit_length(col: ColumnOrName) -> Column: ... 
def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ... def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ... def create_map(*cols: ColumnOrName) -> Column: ... diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 759553f6a4ac..299d14962c88 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -203,6 +203,12 @@ def test_octet_length_function(self): actual = df.select(octet_length('cat')).collect() self.assertEqual([Row(3), Row(4)], actual) + def test_bit_length_function(self): + from pyspark.sql.functions import bit_length + df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) + actual = df.select(bit_length('cat')).collect() + self.assertEqual([Row(24), Row(32)], actual) + def test_array_contains_function(self): from pyspark.sql.functions import array_contains diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index ef921d28ac06..2d12d5f0aacc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2541,6 +2541,14 @@ object functions { */ def base64(e: Column): Column = withExpr { Base64(e.expr) } + /** + * Calculates the bit length for the specified string column. + * + * @group string_funcs + * @since 3.3.0 + */ + def bit_length(e: Column): Column = withExpr { BitLength(e.expr) } + /** * Concatenates multiple input string columns together into a single string column, * using the given separator. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 212e283748fa..353d37dda6c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -486,28 +486,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } - test("string / binary octet-length function") { + test("octet-length function") { val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) .toDF("a", "b", "c", "d", "e", "f") + // string and binary input checkAnswer( df.select(octet_length($"a"), octet_length($"b")), Row(3, 4)) - + // string and binary input checkAnswer( df.selectExpr("octet_length(a)", "octet_length(b)"), Row(3, 4)) - + // integer, float and double input checkAnswer( df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"), Row(3, 3, 5) ) + // multi-byte character input + checkAnswer( + df.selectExpr("octet_length(f)"), + Row(4) + ) + } + test("bit-length function") { + val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) + .toDF("a", "b", "c", "d", "e", "f") + // string and binary input + checkAnswer( + df.select(bit_length($"a"), bit_length($"b")), + Row(24, 32)) + // string and binary input + checkAnswer( + df.selectExpr("bit_length(a)", "bit_length(b)"), + Row(24, 32)) + // integer, float and double input checkAnswer( - df.selectExpr("length(f)", "octet_length(f)"), - Row(1, 4) + df.selectExpr("bit_length(c)", "bit_length(d)", "bit_length(e)"), + Row(24, 24, 40) + ) + // multi-byte character input + checkAnswer( + df.selectExpr("bit_length(f)"), + Row(32) ) } + test("initcap function") { val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z") checkAnswer( From 761f8b6f3d4536d6256d295fbdf7b0a949b00647 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 13:40:51 +0900 
Subject: [PATCH 11/22] Rearrange the order of octet_length --- R/pkg/NAMESPACE | 2 +- R/pkg/R/functions.R | 52 ++++++++++++++++++++++----------------------- R/pkg/R/generics.R | 8 +++---- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 8b8946cf51ca..686a49e44dfa 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -243,9 +243,9 @@ exportMethods("%<=>%", "base64", "between", "bin", + "bit_length", "bitwise_not", "bitwiseNOT", - "bit_length", "bround", "cast", "cbrt", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 35e83d017be9..22bf27c21dbb 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -646,6 +646,19 @@ setMethod("bin", column(jc) }) +#' @details +#' \code{bit_length}: Calculates the bit length for the specified string column. +#' +#' @rdname column_string_functions +#' @aliases bit_length bit_length,Column-method +#' @note length since 3.3.0 +setMethod("bit_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc) + column(jc) + }) + #' @details #' \code{bitwise_not}: Computes bitwise NOT. #' @@ -676,19 +689,6 @@ setMethod("bitwiseNOT", bitwise_not(x) }) -#' @details -#' \code{bit_length}: Calculates the bit length for the specified string column. -#' -#' @rdname column_string_functions -#' @aliases bit_length bit_length,Column-method -#' @note length since 3.3.0 -setMethod("bit_length", - signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc) - column(jc) - }) - #' @details #' \code{cbrt}: Computes the cube-root of the given value. #' @@ -1582,6 +1582,19 @@ setMethod("negate", column(jc) }) +#' @details +#' \code{octet_length}: Calculates the byte length for the specified string column.
+#' +#' @rdname column_string_functions +#' @aliases octet_length octet_length,Column-method +#' @note length since 3.3.0 +setMethod("octet_length", + signature(x = "Column"), + function(x) { + jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc) + column(jc) + }) + #' @details #' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace}, #' starting from byte position \code{pos} of \code{src} and proceeding for @@ -3634,19 +3647,6 @@ setMethod("ntile", column(jc) }) -#' @details -#' \code{octet_length}: Calculates the byte length for the specified string column. -#' -#' @rdname column_string_functions -#' @aliases octet_length octet_length,Column-method -#' @note length since 3.3.0 -setMethod("octet_length", - signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc) - column(jc) - }) - #' @details #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window #' partition. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index f0767c23bd6e..1abde65391e9 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -884,6 +884,10 @@ setGeneric("base64", function(x) { standardGeneric("base64") }) #' @name NULL setGeneric("bin", function(x) { standardGeneric("bin") }) +#' @rdname column_string_functions +#' @name NULL +setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") }) + #' @rdname column_nonaggregate_functions #' @name NULL setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") }) @@ -892,10 +896,6 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") }) #' @name NULL setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") }) -#' @rdname column_string_functions -#' @name NULL -setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") }) - #' @rdname column_math_functions #' @name NULL setGeneric("bround", function(x, ...) 
{ standardGeneric("bround") }) From f7ace95d3989941e12026e854850a4c115128829 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 16:37:55 +0900 Subject: [PATCH 12/22] Disable scalastyle to use non-ASCII character on tests --- .../scala/org/apache/spark/sql/StringFunctionsSuite.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 353d37dda6c1..da2f4fcac71f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -487,6 +487,8 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { } test("octet-length function") { + // scalastyle:off + // non ascii characters are not allowed in the code, so we disable the scalastyle here. val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) .toDF("a", "b", "c", "d", "e", "f") // string and binary input @@ -507,9 +509,12 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { df.selectExpr("octet_length(f)"), Row(4) ) + // scalastyle:on } test("bit-length function") { + // scalastyle:off + // non ascii characters are not allowed in the code, so we disable the scalastyle here.
val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) .toDF("a", "b", "c", "d", "e", "f") // string and binary input @@ -530,6 +535,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { df.selectExpr("bit_length(f)"), Row(32) ) + // scalastyle:on } From 2ed29f5a6981fd51568ce18eace234c2c4ea41f8 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Mon, 13 Sep 2021 17:30:55 +0900 Subject: [PATCH 13/22] Insert newline to python docstrings to keep pylint rules --- python/pyspark/sql/functions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index d6bf872551be..19709792130a 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3112,7 +3112,8 @@ def octet_length(col): Examples ------- >>> from pyspark.sql.functions import octet_length - >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect() + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \ + .select(octet_length('cat')).collect() [Row(octet_length(cat)=3), Row(octet_length(cat)=4)] """ return _invoke_function_over_column("octet_length", col) @@ -3132,7 +3133,8 @@ def bit_length(col): Examples ------- >>> from pyspark.sql.functions import bit_length - >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect() + >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \ + .select(bit_length('cat')).collect() [Row(bit_length(cat)=24), Row(bit_length(cat)=32)] """ return _invoke_function_over_column("bit_length", col) From 08580015c405b3e7e3dbbb3dda504d540ac33b22 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Tue, 14 Sep 2021 12:18:35 +0900 Subject: [PATCH 14/22] add space after comma to pass lint-R --- R/pkg/tests/fulltests/test_sparkSQL.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index c95a3c056aaf..f0cb2745e1d6 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1992,12 +1992,12 @@ test_that("string operators", { l6 <- list(list("cat"), list("\ud83d\udc08")) df6 <- createDataFrame(l6) expect_equal( - collect(select(df6, octet_length(df6$"_1")))[,1], - c(3,4) + collect(select(df6, octet_length(df6$"_1")))[, 1], + c(3, 4) ) expect_equal( - collect(select(df6, bit_length(df6$"_1")))[,1], - c(24,32) + collect(select(df6, bit_length(df6$"_1")))[, 1], + c(24, 32) ) }) From 327d7d561f5c1bb330484f0eb224548c206a2f8d Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Tue, 14 Sep 2021 18:42:12 +0900 Subject: [PATCH 15/22] Delete unnecessary line in the scala test. --- .../test/scala/org/apache/spark/sql/StringFunctionsSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index da2f4fcac71f..e45cc8173a1c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -538,7 +538,6 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { // scalastyle:on } - test("initcap function") { val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z") checkAnswer( From 387a92f3223fa9080d25ca4f1c3a2fc2303c7f08 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Tue, 14 Sep 2021 18:59:20 +0900 Subject: [PATCH 16/22] Add a short description of the test --- python/pyspark/sql/tests/test_functions.py | 2 ++ .../scala/org/apache/spark/sql/StringFunctionsSuite.scala | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 299d14962c88..f8e42303f93a 100644 --- 
a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -198,12 +198,14 @@ def test_string_functions(self): df.select(getattr(functions, name)(col("name"))).first()[0]) def test_octet_length_function(self): + # SPARK-36751: add octet/bit length api for python from pyspark.sql.functions import octet_length df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) actual = df.select(octet_length('cat')).collect() self.assertEqual([Row(3), Row(4)], actual) def test_bit_length_function(self): + # SPARK-36751: add octet/bit length api for python from pyspark.sql.functions import bit_length df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) actual = df.select(bit_length('cat')).collect() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index e45cc8173a1c..3efcc7f694e0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -486,7 +486,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } - test("octet-length function") { + test("SPARK-36751: add octet/bit length api for scala") { // scalastyle:off // non ascii characters are not allowed in the code, so we disable the scalastyle here. val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) @@ -512,7 +512,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { // scalastyle:on } - test("bit-length function") { + test("SPARK-36751: add octet/bit length api for scala") { // scalastyle:off // non ascii characters are not allowed in the code, so we disable the scalastyle here. 
val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) From 70a1217f37b9867190f97a4ade667eb76b89e940 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Tue, 14 Sep 2021 19:11:01 +0900 Subject: [PATCH 17/22] Add a short description to docstrings --- python/pyspark/sql/functions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 19709792130a..a1eae3ead81f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3100,6 +3100,8 @@ def length(col): def octet_length(col): """ + Calculates the byte length for the specified string column. + .. versionadded:: 3.3.0 Parameters ---------- @@ -3121,6 +3123,8 @@ def octet_length(col): def bit_length(col): """ + Calculates the bit length for the specified string column. + .. versionadded:: 3.3.0 Parameters ---------- From 3a0733747c3db00ae4e5fcfd3432be3bf8ae9325 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Tue, 14 Sep 2021 19:15:29 +0900 Subject: [PATCH 18/22] Move import to the top of the file --- python/pyspark/sql/tests/test_functions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index f8e42303f93a..c69e96bddb9e 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -23,7 +23,7 @@ from pyspark.sql import Row, Window from pyspark.sql.functions import udf, input_file_name, col, percentile_approx, \ lit, assert_true, sum_distinct, sumDistinct, shiftleft, shiftLeft, shiftRight, \ - shiftright, shiftrightunsigned, shiftRightUnsigned + shiftright, shiftrightunsigned, shiftRightUnsigned, octet_length, bit_length from pyspark.testing.sqlutils import ReusedSQLTestCase @@ -199,14 +199,12 @@ def test_string_functions(self): def test_octet_length_function(self): # SPARK-36751: add octet/bit length api for python - from pyspark.sql.functions 
import octet_length df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) actual = df.select(octet_length('cat')).collect() self.assertEqual([Row(3), Row(4)], actual) def test_bit_length_function(self): # SPARK-36751: add octet/bit length api for python - from pyspark.sql.functions import bit_length df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) actual = df.select(bit_length('cat')).collect() self.assertEqual([Row(24), Row(32)], actual) From 545fbe2a41cc209171b153111bd58e535ed200ec Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Tue, 14 Sep 2021 21:08:12 +0900 Subject: [PATCH 19/22] differentiate octet_length and bit_length test name --- .../scala/org/apache/spark/sql/StringFunctionsSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 3efcc7f694e0..30a6600c3176 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -486,7 +486,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { ) } - test("SPARK-36751: add octet/bit length api for scala") { + test("SPARK-36751: add octet length api for scala") { // scalastyle:off // non ascii characters are not allowed in the code, so we disable the scalastyle here. val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) @@ -512,7 +512,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { // scalastyle:on } - test("SPARK-36751: add octet/bit length api for scala") { + test("SPARK-36751: add bit length api for scala") { // scalastyle:off // non ascii characters are not allowed in the code, so we disable the scalastyle here. 
val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08")) From afa1700ee3f4e3a0ba0f0a685c091e45f1631a09 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Wed, 15 Sep 2021 10:06:29 +0900 Subject: [PATCH 20/22] Formatting docstrings --- R/pkg/R/functions.R | 4 ++-- python/pyspark/sql/functions.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 22bf27c21dbb..f0768c7e1d83 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -647,7 +647,7 @@ setMethod("bin", }) #' @details -#' \code{bit_length}: Calculates the bit length for the specified string column. +#' \code{bit_length}: Calculates the bit length for the specified string column. #' #' @rdname column_string_functions #' @aliases bit_length bit_length,Column-method @@ -1583,7 +1583,7 @@ setMethod("negate", }) #' @details -#' \code{octet_length}: Calculates the byte length for the specified string column. +#' \code{octet_length}: Calculates the byte length for the specified string column. #' #' @rdname column_string_functions #' @aliases octet_length octet_length,Column-method diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index a1eae3ead81f..105727e999bd 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3103,14 +3103,17 @@ def octet_length(col): Calculates the byte length for the specified string column. .. versionadded:: 3.3.0 + Parameters ---------- col : :class:`~pyspark.sql.Column` or str Source column or strings + Returns ------- :class:`~pyspark.sql.Column` Byte length of the col + Examples ------- >>> from pyspark.sql.functions import octet_length @@ -3126,14 +3129,17 @@ def bit_length(col): Calculates the bit length for the specified string column. ..
versionadded:: 3.3.0 + Parameters ---------- col : :class:`~pyspark.sql.Column` or str Source column or strings + Returns ------- :class:`~pyspark.sql.Column` Bit length of the col + Examples ------- >>> from pyspark.sql.functions import bit_length From 127da47343447c1e3e48c8337187536070dec689 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Wed, 15 Sep 2021 10:35:00 +0900 Subject: [PATCH 21/22] differentiate test comments --- python/pyspark/sql/tests/test_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index c69e96bddb9e..00a2660492a0 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -198,13 +198,13 @@ def test_string_functions(self): df.select(getattr(functions, name)(col("name"))).first()[0]) def test_octet_length_function(self): - # SPARK-36751: add octet/bit length api for python + # SPARK-36751: add octet length api for python df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) actual = df.select(octet_length('cat')).collect() self.assertEqual([Row(3), Row(4)], actual) def test_bit_length_function(self): - # SPARK-36751: add octet/bit length api for python + # SPARK-36751: add bit length api for python df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat']) actual = df.select(bit_length('cat')).collect() self.assertEqual([Row(24), Row(32)], actual) From 41656f43bfa41431035bf9023ee6035f856570b3 Mon Sep 17 00:00:00 2001 From: Leona Yoda Date: Wed, 15 Sep 2021 10:42:18 +0900 Subject: [PATCH 22/22] Add bit/octet length to the api document --- python/docs/source/reference/pyspark.sql.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst index 605a1504e498..cc9c4295567c 100644 --- a/python/docs/source/reference/pyspark.sql.rst +++ 
b/python/docs/source/reference/pyspark.sql.rst @@ -367,6 +367,7 @@ Functions avg base64 bin + bit_length bitwise_not bitwiseNOT broadcast @@ -483,6 +484,7 @@ Functions next_day nth_value ntile + octet_length overlay pandas_udf percent_rank