From 1df379ba2ca8b0339aea6cd5ee0edbe551b23e28 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Fri, 10 Sep 2021 18:23:41 +0900
Subject: [PATCH 01/22] Implement scala/python octet_length

---
 python/pyspark/sql/functions.py               | 20 +++++++++++++++++
 python/pyspark/sql/functions.pyi              |  1 +
 python/pyspark/sql/tests/test_functions.py    |  6 +++++
 .../org/apache/spark/sql/functions.scala      |  8 +++++++
 .../spark/sql/StringFunctionsSuite.scala      | 22 +++++++++++++++++++
 5 files changed, 57 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index e418c0d11f8f..6ee3e4e253db 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3098,6 +3098,26 @@ def length(col):
     return Column(sc._jvm.functions.length(_to_java_column(col)))
 
 
+def octet_length(col):
+    """
+    .. versionadded:: 3.3.0
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        Source column or strings
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        Byte length of the col
+    Examples
+    -------
+    >>> from pyspark.sql.functions import octet_length
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect()
+        [Row(octet_length(cat)=3), Row(octet_length(cat)=4)]
+    """
+    return _invoke_function_over_column("octet_length", col)
+
+
 def translate(srcCol, matching, replace):
     """A function translate any character in the `srcCol` by a character in `matching`.
     The characters in `replace` is corresponding to the characters in `matching`.
diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi
index 143fa133f4fe..f29d252c146a 100644
--- a/python/pyspark/sql/functions.pyi
+++ b/python/pyspark/sql/functions.pyi
@@ -174,6 +174,7 @@ def bin(col: ColumnOrName) -> Column: ...
 def hex(col: ColumnOrName) -> Column: ...
 def unhex(col: ColumnOrName) -> Column: ...
 def length(col: ColumnOrName) -> Column: ...
+def octet_length(col: ColumnOrName) -> Column: ...
 def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ...
 def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ...
 def create_map(*cols: ColumnOrName) -> Column: ...
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 082d61b73242..759553f6a4ac 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -197,6 +197,12 @@ def test_string_functions(self):
                 df.select(getattr(functions, name)("name")).first()[0],
                 df.select(getattr(functions, name)(col("name"))).first()[0])
 
+    def test_octet_length_function(self):
+        from pyspark.sql.functions import octet_length
+        df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
+        actual = df.select(octet_length('cat')).collect()
+        self.assertEqual([Row(3), Row(4)], actual)
+
     def test_array_contains_function(self):
         from pyspark.sql.functions import array_contains
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 781a2dd5649e..ef921d28ac06 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2706,6 +2706,14 @@ object functions {
     StringTrimLeft(e.expr, Literal(trimString))
   }
 
+  /**
+   * Calculates the byte length for the specified string column.
+   *
+   * @group string_funcs
+   * @since 3.3.0
+   */
+  def octet_length(e: Column): Column = withExpr { OctetLength(e.expr) }
+
   /**
    * Extract a specific group matched by a Java regex, from the specified string column.
    * If the regex did not match, or the specified group did not match, an empty string is returned.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 00074b01ba6d..212e283748fa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -486,6 +486,28 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
+  test("string / binary octet-length function") {
+    val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
+      .toDF("a", "b", "c", "d", "e", "f")
+    checkAnswer(
+      df.select(octet_length($"a"), octet_length($"b")),
+      Row(3, 4))
+
+    checkAnswer(
+      df.selectExpr("octet_length(a)", "octet_length(b)"),
+      Row(3, 4))
+
+    checkAnswer(
+      df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"),
+      Row(3, 3, 5)
+    )
+
+    checkAnswer(
+      df.selectExpr("length(f)", "octet_length(f)"),
+      Row(1, 4)
+    )
+  }
+
   test("initcap function") {
     val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z")
     checkAnswer(

From 6df46688e67897bfc4487b45b75c1fba413cc396 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Fri, 10 Sep 2021 20:41:54 +0900
Subject: [PATCH 02/22] Implement R octet_length

---
 R/pkg/NAMESPACE                       |  1 +
 R/pkg/R/functions.R                   | 13 +++++++++++++
 R/pkg/R/generics.R                    |  4 ++++
 R/pkg/tests/fulltests/test_sparkSQL.R |  7 +++++++
 4 files changed, 25 insertions(+)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 7fa80853fd23..28323f8cd8ca 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -364,6 +364,7 @@ exportMethods("%<=>%",
               "not",
               "nth_value",
               "ntile",
+              "octet_length",
               "otherwise",
               "over",
               "overlay",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 62066da10d0d..1781b00b569e 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3621,6 +3621,19 @@ setMethod("ntile",
             column(jc)
           })
 
+#' @details
+#' \code{octet_length}:  Calculates the byte length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases octet_length octet_length,Column-method
+#' @note length since 3.3.0
+setMethod("octet_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window
 #' partition.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 9ebea3f55e69..c52b14518319 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1230,6 +1230,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
 #' @name NULL
 setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("octet_length", function(x, ...) { standardGeneric("octet_length") })
+
 #' @rdname column_string_functions
 #' @name NULL
 setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay") })
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index b97c50074993..0086b49f0e44 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1988,6 +1988,13 @@ test_that("string operators", {
     collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
     ""
   )
+
+  l6 <- list(list("cat"), list("\ud83d\udc08"))
+  df6 <- createDataFrame(l6)
+  expect_equal(
+    collect(select(df6, octet_length(df6$"_1")))[,1],
+    c(3,4)
+  )
 })
 
 test_that("date functions on a DataFrame", {

From eb2456b26e2ea08b8375117617e3e75ea78fbeb7 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 13:00:01 +0900
Subject: [PATCH 03/22] Implement bit-length functions

---
 R/pkg/NAMESPACE                               |  1 +
 R/pkg/R/functions.R                           | 13 +++++++
 R/pkg/R/generics.R                            |  4 +++
 R/pkg/tests/fulltests/test_sparkSQL.R         |  4 +++
 python/pyspark/sql/functions.py               | 20 +++++++++++
 python/pyspark/sql/functions.pyi              |  1 +
 python/pyspark/sql/tests/test_functions.py    |  6 ++++
 .../org/apache/spark/sql/functions.scala      |  8 +++++
 .../spark/sql/StringFunctionsSuite.scala      | 35 ++++++++++++++++---
 9 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 28323f8cd8ca..8b8946cf51ca 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -245,6 +245,7 @@ exportMethods("%<=>%",
               "bin",
               "bitwise_not",
               "bitwiseNOT",
+              "bit_length",
               "bround",
               "cast",
               "cbrt",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 1781b00b569e..35e83d017be9 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -676,6 +676,19 @@ setMethod("bitwiseNOT",
             bitwise_not(x)
           })
 
+#' @details
+#' \code{bit_length}:  Calculates the bit length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases bit_length bit_length,Column-method
+#' @note length since 3.3.0
+setMethod("bit_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{cbrt}: Computes the cube-root of the given value.
 #'
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index c52b14518319..f0767c23bd6e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -892,6 +892,10 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
 #' @name NULL
 setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") })
+
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("bround", function(x, ...) { standardGeneric("bround") })
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 0086b49f0e44..c95a3c056aaf 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1995,6 +1995,10 @@ test_that("string operators", {
     collect(select(df6, octet_length(df6$"_1")))[,1],
     c(3,4)
   )
+  expect_equal(
+    collect(select(df6, bit_length(df6$"_1")))[,1],
+    c(24,32)
+  )
 })
 
 test_that("date functions on a DataFrame", {
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 6ee3e4e253db..d6bf872551be 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3118,6 +3118,26 @@ def octet_length(col):
     return _invoke_function_over_column("octet_length", col)
 
 
+def bit_length(col):
+    """
+    .. versionadded:: 3.3.0
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        Source column or strings
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        Bit length of the col
+    Examples
+    -------
+    >>> from pyspark.sql.functions import bit_length
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect()
+        [Row(bit_length(cat)=24), Row(bit_length(cat)=32)]
+    """
+    return _invoke_function_over_column("bit_length", col)
+
+
 def translate(srcCol, matching, replace):
     """A function translate any character in the `srcCol` by a character in `matching`.
     The characters in `replace` is corresponding to the characters in `matching`.
diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi
index f29d252c146a..1a0a61efb84f 100644
--- a/python/pyspark/sql/functions.pyi
+++ b/python/pyspark/sql/functions.pyi
@@ -175,6 +175,7 @@ def hex(col: ColumnOrName) -> Column: ...
 def unhex(col: ColumnOrName) -> Column: ...
 def length(col: ColumnOrName) -> Column: ...
 def octet_length(col: ColumnOrName) -> Column: ...
+def bit_length(col: ColumnOrName) -> Column: ...
 def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ...
 def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ...
 def create_map(*cols: ColumnOrName) -> Column: ...
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 759553f6a4ac..299d14962c88 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -203,6 +203,12 @@ def test_octet_length_function(self):
         actual = df.select(octet_length('cat')).collect()
         self.assertEqual([Row(3), Row(4)], actual)
 
+    def test_bit_length_function(self):
+        from pyspark.sql.functions import bit_length
+        df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
+        actual = df.select(bit_length('cat')).collect()
+        self.assertEqual([Row(24), Row(32)], actual)
+
     def test_array_contains_function(self):
         from pyspark.sql.functions import array_contains
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index ef921d28ac06..2d12d5f0aacc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2541,6 +2541,14 @@ object functions {
    */
   def base64(e: Column): Column = withExpr { Base64(e.expr) }
 
+  /**
+   * Calculates the bit length for the specified string column.
+   *
+   * @group string_funcs
+   * @since 3.3.0
+   */
+  def bit_length(e: Column): Column = withExpr { BitLength(e.expr) }
+
   /**
    * Concatenates multiple input string columns together into a single string column,
    * using the given separator.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 212e283748fa..353d37dda6c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -486,28 +486,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
-  test("string / binary octet-length function") {
+  test("octet-length function") {
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
       .toDF("a", "b", "c", "d", "e", "f")
+    // string and binary input
     checkAnswer(
       df.select(octet_length($"a"), octet_length($"b")),
       Row(3, 4))
-
+    // string and binary input
     checkAnswer(
       df.selectExpr("octet_length(a)", "octet_length(b)"),
       Row(3, 4))
-
+    // integer, float and double input
     checkAnswer(
       df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"),
       Row(3, 3, 5)
     )
+    // multi-byte character input
+    checkAnswer(
+      df.selectExpr("octet_length(f)"),
+      Row(4)
+    )
+  }
 
+  test("bit-length function") {
+    val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
+      .toDF("a", "b", "c", "d", "e", "f")
+    // string and binary input
+    checkAnswer(
+      df.select(bit_length($"a"), bit_length($"b")),
+      Row(24, 32))
+    // string and binary input
+    checkAnswer(
+      df.selectExpr("bit_length(a)", "bit_length(b)"),
+      Row(24, 32))
+    // integer, float and double input
     checkAnswer(
-      df.selectExpr("length(f)", "octet_length(f)"),
-      Row(1, 4)
+      df.selectExpr("bit_length(c)", "bit_length(d)", "bit_length(e)"),
+      Row(24, 24, 40)
+    )
+    // multi-byte character input
+    checkAnswer(
+      df.selectExpr("bit_length(f)"),
+      Row(32)
     )
   }
 
+
   test("initcap function") {
     val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z")
     checkAnswer(

From 3e149faeb6bb5dce20f41e1feb821367541dff08 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 13:40:51 +0900
Subject: [PATCH 04/22] Rearrange the order of octet_length

---
 R/pkg/NAMESPACE     |  2 +-
 R/pkg/R/functions.R | 52 ++++++++++++++++++++++-----------------------
 R/pkg/R/generics.R  |  8 +++----
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 8b8946cf51ca..686a49e44dfa 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -243,9 +243,9 @@ exportMethods("%<=>%",
               "base64",
               "between",
               "bin",
+              "bit_length",
               "bitwise_not",
               "bitwiseNOT",
-              "bit_length",
               "bround",
               "cast",
               "cbrt",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 35e83d017be9..22bf27c21dbb 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -646,6 +646,19 @@ setMethod("bin",
             column(jc)
           })
 
+#' @details
+#' \code{bit_length}:  Calculates the bit length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases bit_length bit_length,Column-method
+#' @note length since 3.3.0
+setMethod("bit_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{bitwise_not}: Computes bitwise NOT.
 #'
@@ -676,19 +689,6 @@ setMethod("bitwiseNOT",
             bitwise_not(x)
           })
 
-#' @details
-#' \code{bit_length}:  Calculates the bit length for the specified string column.
-#'
-#' @rdname column_string_functions
-#' @aliases bit_length bit_length,Column-method
-#' @note length since 3.3.0
-setMethod("bit_length",
-          signature(x = "Column"),
-          function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc)
-            column(jc)
-          })
-
 #' @details
 #' \code{cbrt}: Computes the cube-root of the given value.
 #'
@@ -1582,6 +1582,19 @@ setMethod("negate",
             column(jc)
           })
 
+#' @details
+#' \code{octet_length}:  Calculates the byte length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases octet_length octet_length,Column-method
+#' @note length since 3.3.0
+setMethod("octet_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace},
 #' starting from byte position \code{pos} of \code{src} and proceeding for
@@ -3634,19 +3647,6 @@ setMethod("ntile",
             column(jc)
           })
 
-#' @details
-#' \code{octet_length}:  Calculates the byte length for the specified string column.
-#'
-#' @rdname column_string_functions
-#' @aliases octet_length octet_length,Column-method
-#' @note length since 3.3.0
-setMethod("octet_length",
-          signature(x = "Column"),
-          function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc)
-            column(jc)
-          })
-
 #' @details
 #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window
 #' partition.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index f0767c23bd6e..1abde65391e9 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -884,6 +884,10 @@ setGeneric("base64", function(x) { standardGeneric("base64") })
 #' @name NULL
 setGeneric("bin", function(x) { standardGeneric("bin") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") })
+
 #' @rdname column_nonaggregate_functions
 #' @name NULL
 setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
@@ -892,10 +896,6 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
 #' @name NULL
 setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
 
-#' @rdname column_string_functions
-#' @name NULL
-setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") })
-
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("bround", function(x, ...) { standardGeneric("bround") })

From ff2f0313dcfc6fbe339d4bb3c67854917ef4a062 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 16:37:55 +0900
Subject: [PATCH 05/22] Disable scalastyle to use non-ASCII characters in tests

---
 .../scala/org/apache/spark/sql/StringFunctionsSuite.scala   | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 353d37dda6c1..da2f4fcac71f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -487,6 +487,8 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
   }
 
   test("octet-length function") {
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
       .toDF("a", "b", "c", "d", "e", "f")
     // string and binary input
@@ -507,9 +509,12 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
       df.selectExpr("octet_length(f)"),
       Row(4)
     )
+    // scalastyle:on
   }
 
   test("bit-length function") {
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
       .toDF("a", "b", "c", "d", "e", "f")
     // string and binary input
@@ -530,6 +535,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
       df.selectExpr("bit_length(f)"),
       Row(32)
     )
+    // scalastyle:on
   }
 
 

From 5b9c98b6bada969d1fb844dcb9a2782448ce96fe Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 17:30:55 +0900
Subject: [PATCH 06/22] Insert newlines into Python docstrings to comply with
 pylint rules

---
 python/pyspark/sql/functions.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index d6bf872551be..19709792130a 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3112,7 +3112,8 @@ def octet_length(col):
     Examples
     -------
     >>> from pyspark.sql.functions import octet_length
-    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect()
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \
+            .select(octet_length('cat')).collect()
         [Row(octet_length(cat)=3), Row(octet_length(cat)=4)]
     """
     return _invoke_function_over_column("octet_length", col)
@@ -3132,7 +3133,8 @@ def bit_length(col):
     Examples
     -------
     >>> from pyspark.sql.functions import bit_length
-    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect()
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \
+            .select(bit_length('cat')).collect()
         [Row(bit_length(cat)=24), Row(bit_length(cat)=32)]
     """
     return _invoke_function_over_column("bit_length", col)

From 848f2249d58adb11a44e82772db71181fb82bef8 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Tue, 14 Sep 2021 12:18:35 +0900
Subject: [PATCH 07/22] add space after comma to pass lint-R

---
 R/pkg/tests/fulltests/test_sparkSQL.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index c95a3c056aaf..f0cb2745e1d6 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1992,12 +1992,12 @@ test_that("string operators", {
   l6 <- list(list("cat"), list("\ud83d\udc08"))
   df6 <- createDataFrame(l6)
   expect_equal(
-    collect(select(df6, octet_length(df6$"_1")))[,1],
-    c(3,4)
+    collect(select(df6, octet_length(df6$"_1")))[, 1],
+    c(3, 4)
   )
   expect_equal(
-    collect(select(df6, bit_length(df6$"_1")))[,1],
-    c(24,32)
+    collect(select(df6, bit_length(df6$"_1")))[, 1],
+    c(24, 32)
   )
 })
 

From 4edc918860ef3e1784de026b8bb0c6951ea428a0 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Fri, 10 Sep 2021 18:23:41 +0900
Subject: [PATCH 08/22] Implement scala/python octet_length

---
 python/pyspark/sql/functions.py               | 20 +++++++++++++++++
 python/pyspark/sql/functions.pyi              |  1 +
 python/pyspark/sql/tests/test_functions.py    |  6 +++++
 .../org/apache/spark/sql/functions.scala      |  8 +++++++
 .../spark/sql/StringFunctionsSuite.scala      | 22 +++++++++++++++++++
 5 files changed, 57 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index e418c0d11f8f..6ee3e4e253db 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3098,6 +3098,26 @@ def length(col):
     return Column(sc._jvm.functions.length(_to_java_column(col)))
 
 
+def octet_length(col):
+    """
+    .. versionadded:: 3.3.0
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        Source column or strings
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        Byte length of the col
+    Examples
+    -------
+    >>> from pyspark.sql.functions import octet_length
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect()
+        [Row(octet_length(cat)=3), Row(octet_length(cat)=4)]
+    """
+    return _invoke_function_over_column("octet_length", col)
+
+
 def translate(srcCol, matching, replace):
     """A function translate any character in the `srcCol` by a character in `matching`.
     The characters in `replace` is corresponding to the characters in `matching`.
diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi
index 143fa133f4fe..f29d252c146a 100644
--- a/python/pyspark/sql/functions.pyi
+++ b/python/pyspark/sql/functions.pyi
@@ -174,6 +174,7 @@ def bin(col: ColumnOrName) -> Column: ...
 def hex(col: ColumnOrName) -> Column: ...
 def unhex(col: ColumnOrName) -> Column: ...
 def length(col: ColumnOrName) -> Column: ...
+def octet_length(col: ColumnOrName) -> Column: ...
 def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ...
 def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ...
 def create_map(*cols: ColumnOrName) -> Column: ...
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 082d61b73242..759553f6a4ac 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -197,6 +197,12 @@ def test_string_functions(self):
                 df.select(getattr(functions, name)("name")).first()[0],
                 df.select(getattr(functions, name)(col("name"))).first()[0])
 
+    def test_octet_length_function(self):
+        from pyspark.sql.functions import octet_length
+        df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
+        actual = df.select(octet_length('cat')).collect()
+        self.assertEqual([Row(3), Row(4)], actual)
+
     def test_array_contains_function(self):
         from pyspark.sql.functions import array_contains
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 781a2dd5649e..ef921d28ac06 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2706,6 +2706,14 @@ object functions {
     StringTrimLeft(e.expr, Literal(trimString))
   }
 
+  /**
+   * Calculates the byte length for the specified string column.
+   *
+   * @group string_funcs
+   * @since 3.3.0
+   */
+  def octet_length(e: Column): Column = withExpr { OctetLength(e.expr) }
+
   /**
    * Extract a specific group matched by a Java regex, from the specified string column.
    * If the regex did not match, or the specified group did not match, an empty string is returned.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 00074b01ba6d..212e283748fa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -486,6 +486,28 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
+  test("string / binary octet-length function") {
+    val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
+      .toDF("a", "b", "c", "d", "e", "f")
+    checkAnswer(
+      df.select(octet_length($"a"), octet_length($"b")),
+      Row(3, 4))
+
+    checkAnswer(
+      df.selectExpr("octet_length(a)", "octet_length(b)"),
+      Row(3, 4))
+
+    checkAnswer(
+      df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"),
+      Row(3, 3, 5)
+    )
+
+    checkAnswer(
+      df.selectExpr("length(f)", "octet_length(f)"),
+      Row(1, 4)
+    )
+  }
+
   test("initcap function") {
     val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z")
     checkAnswer(

From 5370a5134cfc8508684920e93ca0ee334a7c113b Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Fri, 10 Sep 2021 20:41:54 +0900
Subject: [PATCH 09/22] Implement R octet_length

---
 R/pkg/NAMESPACE                       |  1 +
 R/pkg/R/functions.R                   | 13 +++++++++++++
 R/pkg/R/generics.R                    |  4 ++++
 R/pkg/tests/fulltests/test_sparkSQL.R |  7 +++++++
 4 files changed, 25 insertions(+)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 7fa80853fd23..28323f8cd8ca 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -364,6 +364,7 @@ exportMethods("%<=>%",
               "not",
               "nth_value",
               "ntile",
+              "octet_length",
               "otherwise",
               "over",
               "overlay",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 62066da10d0d..1781b00b569e 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -3621,6 +3621,19 @@ setMethod("ntile",
             column(jc)
           })
 
+#' @details
+#' \code{octet_length}:  Calculates the byte length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases octet_length octet_length,Column-method
+#' @note length since 3.3.0
+setMethod("octet_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window
 #' partition.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 9ebea3f55e69..c52b14518319 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1230,6 +1230,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
 #' @name NULL
 setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("octet_length", function(x, ...) { standardGeneric("octet_length") })
+
 #' @rdname column_string_functions
 #' @name NULL
 setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay") })
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index b97c50074993..0086b49f0e44 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1988,6 +1988,13 @@ test_that("string operators", {
     collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
     ""
   )
+
+  l6 <- list(list("cat"), list("\ud83d\udc08"))
+  df6 <- createDataFrame(l6)
+  expect_equal(
+    collect(select(df6, octet_length(df6$"_1")))[,1],
+    c(3,4)
+  )
 })
 
 test_that("date functions on a DataFrame", {

From 72c939859c538c80b421833e9c1b567c35390df8 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 13:00:01 +0900
Subject: [PATCH 10/22] Implement bit-length functions

---
 R/pkg/NAMESPACE                               |  1 +
 R/pkg/R/functions.R                           | 13 +++++++
 R/pkg/R/generics.R                            |  4 +++
 R/pkg/tests/fulltests/test_sparkSQL.R         |  4 +++
 python/pyspark/sql/functions.py               | 20 +++++++++++
 python/pyspark/sql/functions.pyi              |  1 +
 python/pyspark/sql/tests/test_functions.py    |  6 ++++
 .../org/apache/spark/sql/functions.scala      |  8 +++++
 .../spark/sql/StringFunctionsSuite.scala      | 35 ++++++++++++++++---
 9 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 28323f8cd8ca..8b8946cf51ca 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -245,6 +245,7 @@ exportMethods("%<=>%",
               "bin",
               "bitwise_not",
               "bitwiseNOT",
+              "bit_length",
               "bround",
               "cast",
               "cbrt",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 1781b00b569e..35e83d017be9 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -676,6 +676,19 @@ setMethod("bitwiseNOT",
             bitwise_not(x)
           })
 
+#' @details
+#' \code{bit_length}:  Calculates the bit length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases bit_length bit_length,Column-method
+#' @note length since 3.3.0
+setMethod("bit_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{cbrt}: Computes the cube-root of the given value.
 #'
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index c52b14518319..f0767c23bd6e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -892,6 +892,10 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
 #' @name NULL
 setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") })
+
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("bround", function(x, ...) { standardGeneric("bround") })
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index 0086b49f0e44..c95a3c056aaf 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1995,6 +1995,10 @@ test_that("string operators", {
     collect(select(df6, octet_length(df6$"_1")))[,1],
     c(3,4)
   )
+  expect_equal(
+    collect(select(df6, bit_length(df6$"_1")))[,1],
+    c(24,32)
+  )
 })
 
 test_that("date functions on a DataFrame", {
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 6ee3e4e253db..d6bf872551be 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3118,6 +3118,26 @@ def octet_length(col):
     return _invoke_function_over_column("octet_length", col)
 
 
+def bit_length(col):
+    """
+    .. versionadded:: 3.3.0
+    Parameters
+    ----------
+    col : :class:`~pyspark.sql.Column` or str
+        Source column or strings
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        Bit length of the col
+    Examples
+    -------
+    >>> from pyspark.sql.functions import bit_length
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect()
+        [Row(bit_length(cat)=24), Row(bit_length(cat)=32)]
+    """
+    return _invoke_function_over_column("bit_length", col)
+
+
 def translate(srcCol, matching, replace):
     """A function translate any character in the `srcCol` by a character in `matching`.
     The characters in `replace` is corresponding to the characters in `matching`.
diff --git a/python/pyspark/sql/functions.pyi b/python/pyspark/sql/functions.pyi
index f29d252c146a..1a0a61efb84f 100644
--- a/python/pyspark/sql/functions.pyi
+++ b/python/pyspark/sql/functions.pyi
@@ -175,6 +175,7 @@ def hex(col: ColumnOrName) -> Column: ...
 def unhex(col: ColumnOrName) -> Column: ...
 def length(col: ColumnOrName) -> Column: ...
 def octet_length(col: ColumnOrName) -> Column: ...
+def bit_length(col: ColumnOrName) -> Column: ...
 def translate(srcCol: ColumnOrName, matching: str, replace: str) -> Column: ...
 def map_from_arrays(col1: ColumnOrName, col2: ColumnOrName) -> Column: ...
 def create_map(*cols: ColumnOrName) -> Column: ...
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 759553f6a4ac..299d14962c88 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -203,6 +203,12 @@ def test_octet_length_function(self):
         actual = df.select(octet_length('cat')).collect()
         self.assertEqual([Row(3), Row(4)], actual)
 
+    def test_bit_length_function(self):
+        from pyspark.sql.functions import bit_length
+        df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
+        actual = df.select(bit_length('cat')).collect()
+        self.assertEqual([Row(24), Row(32)], actual)
+
     def test_array_contains_function(self):
         from pyspark.sql.functions import array_contains
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index ef921d28ac06..2d12d5f0aacc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2541,6 +2541,14 @@ object functions {
    */
   def base64(e: Column): Column = withExpr { Base64(e.expr) }
 
+  /**
+   * Calculates the bit length for the specified string column.
+   *
+   * @group string_funcs
+   * @since 3.3.0
+   */
+  def bit_length(e: Column): Column = withExpr { BitLength(e.expr) }
+
   /**
    * Concatenates multiple input string columns together into a single string column,
    * using the given separator.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 212e283748fa..353d37dda6c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -486,28 +486,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
-  test("string / binary octet-length function") {
+  test("octet-length function") {
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
       .toDF("a", "b", "c", "d", "e", "f")
+    // string and binary input
     checkAnswer(
       df.select(octet_length($"a"), octet_length($"b")),
       Row(3, 4))
-
+    // string and binary input (SQL expression)
     checkAnswer(
       df.selectExpr("octet_length(a)", "octet_length(b)"),
       Row(3, 4))
-
+    // integer, float and double input
     checkAnswer(
       df.selectExpr("octet_length(c)", "octet_length(d)", "octet_length(e)"),
       Row(3, 3, 5)
     )
+    // multi-byte character input
+    checkAnswer(
+      df.selectExpr("octet_length(f)"),
+      Row(4)
+    )
+  }
 
+  test("bit-length function") {
+    val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
+      .toDF("a", "b", "c", "d", "e", "f")
+    // string and binary input
+    checkAnswer(
+      df.select(bit_length($"a"), bit_length($"b")),
+      Row(24, 32))
+    // string and binary input (SQL expression)
+    checkAnswer(
+      df.selectExpr("bit_length(a)", "bit_length(b)"),
+      Row(24, 32))
+    // integer, float and double input
     checkAnswer(
-      df.selectExpr("length(f)", "octet_length(f)"),
-      Row(1, 4)
+      df.selectExpr("bit_length(c)", "bit_length(d)", "bit_length(e)"),
+      Row(24, 24, 40)
+    )
+    // multi-byte character input
+    checkAnswer(
+      df.selectExpr("bit_length(f)"),
+      Row(32)
     )
   }
 
+
   test("initcap function") {
     val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z")
     checkAnswer(

From 761f8b6f3d4536d6256d295fbdf7b0a949b00647 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 13:40:51 +0900
Subject: [PATCH 11/22] Rearrange the order of octet_length

---
 R/pkg/NAMESPACE     |  2 +-
 R/pkg/R/functions.R | 52 ++++++++++++++++++++++-----------------------
 R/pkg/R/generics.R  |  8 +++----
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 8b8946cf51ca..686a49e44dfa 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -243,9 +243,9 @@ exportMethods("%<=>%",
               "base64",
               "between",
               "bin",
+              "bit_length",
               "bitwise_not",
               "bitwiseNOT",
-              "bit_length",
               "bround",
               "cast",
               "cbrt",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 35e83d017be9..22bf27c21dbb 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -646,6 +646,19 @@ setMethod("bin",
             column(jc)
           })
 
+#' @details
+#' \code{bit_length}:  Calculates the bit length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases bit_length bit_length,Column-method
+#' @note bit_length since 3.3.0
+setMethod("bit_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{bitwise_not}: Computes bitwise NOT.
 #'
@@ -676,19 +689,6 @@ setMethod("bitwiseNOT",
             bitwise_not(x)
           })
 
-#' @details
-#' \code{bit_length}:  Calculates the bit length for the specified string column.
-#'
-#' @rdname column_string_functions
-#' @aliases bit_length bit_length,Column-method
-#' @note length since 3.3.0
-setMethod("bit_length",
-          signature(x = "Column"),
-          function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "bit_length", x@jc)
-            column(jc)
-          })
-
 #' @details
 #' \code{cbrt}: Computes the cube-root of the given value.
 #'
@@ -1582,6 +1582,19 @@ setMethod("negate",
             column(jc)
           })
 
+#' @details
+#' \code{octet_length}:  Calculates the byte length for the specified string column.
+#'
+#' @rdname column_string_functions
+#' @aliases octet_length octet_length,Column-method
+#' @note octet_length since 3.3.0
+setMethod("octet_length",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace},
 #' starting from byte position \code{pos} of \code{src} and proceeding for
@@ -3634,19 +3647,6 @@ setMethod("ntile",
             column(jc)
           })
 
-#' @details
-#' \code{octet_length}:  Calculates the byte length for the specified string column.
-#'
-#' @rdname column_string_functions
-#' @aliases octet_length octet_length,Column-method
-#' @note length since 3.3.0
-setMethod("octet_length",
-          signature(x = "Column"),
-          function(x) {
-            jc <- callJStatic("org.apache.spark.sql.functions", "octet_length", x@jc)
-            column(jc)
-          })
-
 #' @details
 #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window
 #' partition.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index f0767c23bd6e..1abde65391e9 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -884,6 +884,10 @@ setGeneric("base64", function(x) { standardGeneric("base64") })
 #' @name NULL
 setGeneric("bin", function(x) { standardGeneric("bin") })
 
+#' @rdname column_string_functions
+#' @name NULL
+setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") })
+
 #' @rdname column_nonaggregate_functions
 #' @name NULL
 setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
@@ -892,10 +896,6 @@ setGeneric("bitwise_not", function(x) { standardGeneric("bitwise_not") })
 #' @name NULL
 setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })
 
-#' @rdname column_string_functions
-#' @name NULL
-setGeneric("bit_length", function(x, ...) { standardGeneric("bit_length") })
-
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("bround", function(x, ...) { standardGeneric("bround") })

From f7ace95d3989941e12026e854850a4c115128829 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 16:37:55 +0900
Subject: [PATCH 12/22] Disable scalastyle to use non-ascii character in tests

---
 .../scala/org/apache/spark/sql/StringFunctionsSuite.scala   | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 353d37dda6c1..da2f4fcac71f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -487,6 +487,8 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
   }
 
   test("octet-length function") {
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
       .toDF("a", "b", "c", "d", "e", "f")
     // string and binary input
@@ -507,9 +509,12 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
       df.selectExpr("octet_length(f)"),
       Row(4)
     )
+    // scalastyle:on
   }
 
   test("bit-length function") {
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
       .toDF("a", "b", "c", "d", "e", "f")
     // string and binary input
@@ -530,6 +535,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
       df.selectExpr("bit_length(f)"),
       Row(32)
     )
+    // scalastyle:on
   }
 
 

From 2ed29f5a6981fd51568ce18eace234c2c4ea41f8 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Mon, 13 Sep 2021 17:30:55 +0900
Subject: [PATCH 13/22] Insert newline to python docstrings to keep pylint
 rules

---
 python/pyspark/sql/functions.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index d6bf872551be..19709792130a 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3112,7 +3112,8 @@ def octet_length(col):
     Examples
     -------
     >>> from pyspark.sql.functions import octet_length
-    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(octet_length('cat')).collect()
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \
+            .select(octet_length('cat')).collect()
         [Row(octet_length(cat)=3), Row(octet_length(cat)=4)]
     """
     return _invoke_function_over_column("octet_length", col)
@@ -3132,7 +3133,8 @@ def bit_length(col):
     Examples
     -------
     >>> from pyspark.sql.functions import bit_length
-    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']).select(bit_length('cat')).collect()
+    >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \
+            .select(bit_length('cat')).collect()
         [Row(bit_length(cat)=24), Row(bit_length(cat)=32)]
     """
     return _invoke_function_over_column("bit_length", col)

From 08580015c405b3e7e3dbbb3dda504d540ac33b22 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Tue, 14 Sep 2021 12:18:35 +0900
Subject: [PATCH 14/22] add space after comma to pass lint-R

---
 R/pkg/tests/fulltests/test_sparkSQL.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
index c95a3c056aaf..f0cb2745e1d6 100644
--- a/R/pkg/tests/fulltests/test_sparkSQL.R
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1992,12 +1992,12 @@ test_that("string operators", {
   l6 <- list(list("cat"), list("\ud83d\udc08"))
   df6 <- createDataFrame(l6)
   expect_equal(
-    collect(select(df6, octet_length(df6$"_1")))[,1],
-    c(3,4)
+    collect(select(df6, octet_length(df6$"_1")))[, 1],
+    c(3, 4)
   )
   expect_equal(
-    collect(select(df6, bit_length(df6$"_1")))[,1],
-    c(24,32)
+    collect(select(df6, bit_length(df6$"_1")))[, 1],
+    c(24, 32)
   )
 })
 

From 327d7d561f5c1bb330484f0eb224548c206a2f8d Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Tue, 14 Sep 2021 18:42:12 +0900
Subject: [PATCH 15/22] Delete unnecessary line in the scala test.

---
 .../test/scala/org/apache/spark/sql/StringFunctionsSuite.scala   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index da2f4fcac71f..e45cc8173a1c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -538,7 +538,6 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     // scalastyle:on
   }
 
-
   test("initcap function") {
     val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z")
     checkAnswer(

From 387a92f3223fa9080d25ca4f1c3a2fc2303c7f08 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Tue, 14 Sep 2021 18:59:20 +0900
Subject: [PATCH 16/22] Add a short description of the test

---
 python/pyspark/sql/tests/test_functions.py                    | 2 ++
 .../scala/org/apache/spark/sql/StringFunctionsSuite.scala     | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 299d14962c88..f8e42303f93a 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -198,12 +198,14 @@ def test_string_functions(self):
                 df.select(getattr(functions, name)(col("name"))).first()[0])
 
     def test_octet_length_function(self):
+        # SPARK-36751: add octet/bit length api for python
         from pyspark.sql.functions import octet_length
         df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
         actual = df.select(octet_length('cat')).collect()
         self.assertEqual([Row(3), Row(4)], actual)
 
     def test_bit_length_function(self):
+        # SPARK-36751: add octet/bit length api for python
         from pyspark.sql.functions import bit_length
         df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
         actual = df.select(bit_length('cat')).collect()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index e45cc8173a1c..3efcc7f694e0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -486,7 +486,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
-  test("octet-length function") {
+  test("SPARK-36751: add octet/bit length api for scala") {
     // scalastyle:off
     // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
@@ -512,7 +512,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     // scalastyle:on
   }
 
-  test("bit-length function") {
+  test("SPARK-36751: add octet/bit length api for scala") {
     // scalastyle:off
     // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))

From 70a1217f37b9867190f97a4ade667eb76b89e940 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Tue, 14 Sep 2021 19:11:01 +0900
Subject: [PATCH 17/22] Add a short description to docstrings

---
 python/pyspark/sql/functions.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 19709792130a..a1eae3ead81f 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3100,6 +3100,8 @@ def length(col):
 
 def octet_length(col):
     """
+    Calculates the byte length for the specified string column.
+
     .. versionadded:: 3.3.0
     Parameters
     ----------
@@ -3121,6 +3123,8 @@ def octet_length(col):
 
 def bit_length(col):
     """
+    Calculates the bit length for the specified string column.
+
     .. versionadded:: 3.3.0
     Parameters
     ----------

From 3a0733747c3db00ae4e5fcfd3432be3bf8ae9325 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Tue, 14 Sep 2021 19:15:29 +0900
Subject: [PATCH 18/22] Move import to the top of the file

---
 python/pyspark/sql/tests/test_functions.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index f8e42303f93a..c69e96bddb9e 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -23,7 +23,7 @@
 from pyspark.sql import Row, Window
 from pyspark.sql.functions import udf, input_file_name, col, percentile_approx, \
     lit, assert_true, sum_distinct, sumDistinct, shiftleft, shiftLeft, shiftRight, \
-    shiftright, shiftrightunsigned, shiftRightUnsigned
+    shiftright, shiftrightunsigned, shiftRightUnsigned, octet_length, bit_length
 from pyspark.testing.sqlutils import ReusedSQLTestCase
 
 
@@ -199,14 +199,12 @@ def test_string_functions(self):
 
     def test_octet_length_function(self):
         # SPARK-36751: add octet/bit length api for python
-        from pyspark.sql.functions import octet_length
         df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
         actual = df.select(octet_length('cat')).collect()
         self.assertEqual([Row(3), Row(4)], actual)
 
     def test_bit_length_function(self):
         # SPARK-36751: add octet/bit length api for python
-        from pyspark.sql.functions import bit_length
         df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
         actual = df.select(bit_length('cat')).collect()
         self.assertEqual([Row(24), Row(32)], actual)

From 545fbe2a41cc209171b153111bd58e535ed200ec Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Tue, 14 Sep 2021 21:08:12 +0900
Subject: [PATCH 19/22] differentiate octet_length and bit_length test name

---
 .../scala/org/apache/spark/sql/StringFunctionsSuite.scala     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 3efcc7f694e0..30a6600c3176 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -486,7 +486,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
-  test("SPARK-36751: add octet/bit length api for scala") {
+  test("SPARK-36751: add octet length api for scala") {
     // scalastyle:off
     // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))
@@ -512,7 +512,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     // scalastyle:on
   }
 
-  test("SPARK-36751: add octet/bit length api for scala") {
+  test("SPARK-36751: add bit length api for scala") {
     // scalastyle:off
     // non ascii characters are not allowed in the code, so we disable the scalastyle here.
     val df = Seq(("123", Array[Byte](1, 2, 3, 4), 123, 2.0f, 3.015, "\ud83d\udc08"))

From afa1700ee3f4e3a0ba0f0a685c091e45f1631a09 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Wed, 15 Sep 2021 10:06:29 +0900
Subject: [PATCH 20/22] Formatting docstrings

---
 R/pkg/R/functions.R             | 4 ++--
 python/pyspark/sql/functions.py | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 22bf27c21dbb..f0768c7e1d83 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -647,7 +647,7 @@ setMethod("bin",
           })
 
 #' @details
-#' \code{bit_length}:  Calculates the bit length for the specified string column.
+#' \code{bit_length}: Calculates the bit length for the specified string column.
 #'
 #' @rdname column_string_functions
 #' @aliases bit_length bit_length,Column-method
@@ -1583,7 +1583,7 @@ setMethod("negate",
           })
 
 #' @details
-#' \code{octet_length}:  Calculates the byte length for the specified string column.
+#' \code{octet_length}: Calculates the byte length for the specified string column.
 #'
 #' @rdname column_string_functions
 #' @aliases octet_length octet_length,Column-method
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index a1eae3ead81f..105727e999bd 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3103,14 +3103,17 @@ def octet_length(col):
     Calculates the byte length for the specified string column.
 
     .. versionadded:: 3.3.0
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
         Source column or strings
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
         Byte length of the col
+
     Examples
     -------
     >>> from pyspark.sql.functions import octet_length
@@ -3126,14 +3129,17 @@ def bit_length(col):
     Calculates the bit length for the specified string column.
 
     .. versionadded:: 3.3.0
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
         Source column or strings
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
         Bit length of the col
+
     Examples
     -------
     >>> from pyspark.sql.functions import bit_length

From 127da47343447c1e3e48c8337187536070dec689 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Wed, 15 Sep 2021 10:35:00 +0900
Subject: [PATCH 21/22] differentiate test comments

---
 python/pyspark/sql/tests/test_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index c69e96bddb9e..00a2660492a0 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -198,13 +198,13 @@ def test_string_functions(self):
                 df.select(getattr(functions, name)(col("name"))).first()[0])
 
     def test_octet_length_function(self):
-        # SPARK-36751: add octet/bit length api for python
+        # SPARK-36751: add octet length api for python
         df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
         actual = df.select(octet_length('cat')).collect()
         self.assertEqual([Row(3), Row(4)], actual)
 
     def test_bit_length_function(self):
-        # SPARK-36751: add octet/bit length api for python
+        # SPARK-36751: add bit length api for python
         df = self.spark.createDataFrame([('cat',), ('\U0001F408',)], ['cat'])
         actual = df.select(bit_length('cat')).collect()
         self.assertEqual([Row(24), Row(32)], actual)

From 41656f43bfa41431035bf9023ee6035f856570b3 Mon Sep 17 00:00:00 2001
From: Leona Yoda <yodal@oss.nttdata.com>
Date: Wed, 15 Sep 2021 10:42:18 +0900
Subject: [PATCH 22/22] Add bit/octet length to the api document

---
 python/docs/source/reference/pyspark.sql.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst
index 605a1504e498..cc9c4295567c 100644
--- a/python/docs/source/reference/pyspark.sql.rst
+++ b/python/docs/source/reference/pyspark.sql.rst
@@ -367,6 +367,7 @@ Functions
     avg
     base64
     bin
+    bit_length
     bitwise_not
     bitwiseNOT
     broadcast
@@ -483,6 +484,7 @@ Functions
     next_day
     nth_value
     ntile
+    octet_length
     overlay
     pandas_udf
     percent_rank