From 5f0ebddc36bb4f63aca162d3f0c23d56860a55b6 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 25 Mar 2017 16:15:59 +0900 Subject: [PATCH 1/4] Match Scala/Python/R changes --- R/pkg/R/functions.R | 6 +++--- python/pyspark/sql/functions.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 2cff3ac08c3a..449476dec533 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2632,8 +2632,8 @@ setMethod("date_sub", signature(y = "Column", x = "numeric"), #' format_number #' -#' Formats numeric column y to a format like '#,###,###.##', rounded to x decimal places, -#' and returns the result as a string column. +#' Formats numeric column y to a format like '#,###,###.##', rounded to x decimal places +#' with HALF_EVEN round mode, and returns the result as a string column. #' #' If x is 0, the result has no decimal point or fractional part. #' If x < 0, the result will be null. @@ -3548,7 +3548,7 @@ setMethod("row_number", #' array_contains #' -#' Returns true if the array contain the value. +#' Returns null if the array is null, true if the array contains the value, and false otherwise. #' #' @param x A Column #' @param value A value to be checked if contained in the column diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index f9121e60f35b..1697d145e9b1 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1327,8 +1327,8 @@ def encode(col, charset): @since(1.5) def format_number(col, d): """ - Formats the number X to a format like '#,--#,--#.--', rounded to d decimal places, - and returns the result as a string. + Formats the number X to a format like '#,--#,--#.--', rounded to d decimal places + with HALF_EVEN round mode, and returns the result as a string. 
:param col: the column name of the numeric value to be formatted :param d: the N decimal places @@ -1675,8 +1675,8 @@ def array(*cols): @since(1.5) def array_contains(col, value): """ - Collection function: returns True if the array contains the given value. The collection - elements and value must be of the same type. + Collection function: returns null if the array is null, true if the array contains the + given value, and false otherwise. :param col: name of column containing array :param value: value to check for in array @@ -1684,6 +1684,9 @@ def array_contains(col, value): >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) >>> df.select(array_contains(df.data, "a")).collect() [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)] + >>> df = spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data']) + >>> df.select(array_contains(df.data, 1)).collect() + [Row(array_contains(data, 1)=True), Row(array_contains(data, 1)=False)] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_contains(_to_java_column(col), value)) From 33f132808562aaa78446c05f1fb3462603346935 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 27 Mar 2017 08:12:28 +0900 Subject: [PATCH 2/4] Remove potentially confusing doctest in array_contains in Python --- python/pyspark/sql/functions.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 1697d145e9b1..843ae3816f06 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1684,9 +1684,6 @@ def array_contains(col, value): >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data']) >>> df.select(array_contains(df.data, "a")).collect() [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)] - >>> df = spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data']) - >>> df.select(array_contains(df.data, 1)).collect() - [Row(array_contains(data, 1)=True), 
Row(array_contains(data, 1)=False)] """ sc = SparkContext._active_spark_context return Column(sc._jvm.functions.array_contains(_to_java_column(col), value)) From d21a9bf760d5e03d844b26c44132e3c5f9953af6 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 27 Mar 2017 08:26:59 +0900 Subject: [PATCH 3/4] Add the test in python/pyspark/sql/tests.py instead --- python/pyspark/sql/tests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index b93b7ed19210..265238041c5d 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1129,6 +1129,14 @@ def test_rand_functions(self): rndn2 = df.select('key', functions.randn(0)).collect() self.assertEqual(sorted(rndn1), sorted(rndn2)) + def test_array_contains_function(self): + from pyspark.sql.functions import array_contains + + df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data']) + b = df.select(array_contains(df.data, 1).alias('bool')).collect() + # The value argument can be implicitly cast to the array's element type. 
+ self.assertEqual([Row(bool=True), Row(bool=False)], b) + def test_between_function(self): df = self.sc.parallelize([ Row(a=1, b=2, c=3), From d05aba5b70ad22fd0e5661168dc6deceff51a13e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 27 Mar 2017 08:28:50 +0900 Subject: [PATCH 4/4] Rename variables --- python/pyspark/sql/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 265238041c5d..db41b4edb6dd 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -1133,9 +1133,9 @@ def test_array_contains_function(self): from pyspark.sql.functions import array_contains df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data']) - b = df.select(array_contains(df.data, 1).alias('bool')).collect() + actual = df.select(array_contains(df.data, 1).alias('b')).collect() # The value argument can be implicitly cast to the array's element type. - self.assertEqual([Row(bool=True), Row(bool=False)], b) + self.assertEqual([Row(b=True), Row(b=False)], actual) def test_between_function(self): df = self.sc.parallelize([