Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 185 additions & 13 deletions python/pyspark/sql/functions/builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -12963,7 +12963,7 @@ def array_prepend(col: "ColumnOrName", value: Any) -> Column:
@_try_remote_functions
def array_remove(col: "ColumnOrName", element: Any) -> Column:
"""
Collection function: Remove all elements that equal to element from the given array.
    Array function: Remove all elements that are equal to the given element from the given array.

.. versionadded:: 2.4.0

Expand All @@ -12980,21 +12980,77 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
Returns
-------
:class:`~pyspark.sql.Column`
an array excluding given value.
A new column that is an array excluding the given value from the input column.

Examples
--------
>>> df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([],)], ['data'])
>>> df.select(array_remove(df.data, 1)).collect()
[Row(array_remove(data, 1)=[2, 3]), Row(array_remove(data, 1)=[])]
Example 1: Removing a specific value from a simple array

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3, 1, 1],)], ['data'])
>>> df.select(sf.array_remove(df.data, 1)).show()
+---------------------+
|array_remove(data, 1)|
+---------------------+
| [2, 3]|
+---------------------+

Example 2: Removing a specific value from multiple arrays

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([4, 5, 5, 4],)], ['data'])
>>> df.select(sf.array_remove(df.data, 5)).show()
+---------------------+
|array_remove(data, 5)|
+---------------------+
| [1, 2, 3, 1, 1]|
| [4, 4]|
+---------------------+

Example 3: Removing a value that does not exist in the array

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3],)], ['data'])
>>> df.select(sf.array_remove(df.data, 4)).show()
+---------------------+
|array_remove(data, 4)|
+---------------------+
| [1, 2, 3]|
+---------------------+

Example 4: Removing a value from an array with all identical values

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 1, 1],)], ['data'])
>>> df.select(sf.array_remove(df.data, 1)).show()
+---------------------+
|array_remove(data, 1)|
+---------------------+
| []|
+---------------------+

Example 5: Removing a value from an empty array

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
>>> schema = StructType([
... StructField("data", ArrayType(IntegerType()), True)
... ])
>>> df = spark.createDataFrame([([],)], schema)
>>> df.select(sf.array_remove(df.data, 1)).show()
+---------------------+
|array_remove(data, 1)|
+---------------------+
| []|
+---------------------+
"""
return _invoke_function("array_remove", _to_java_column(col), element)


@_try_remote_functions
def array_distinct(col: "ColumnOrName") -> Column:
"""
Collection function: removes duplicate values from the array.
Array function: removes duplicate values from the array.

.. versionadded:: 2.4.0

Expand All @@ -13009,13 +13065,69 @@ def array_distinct(col: "ColumnOrName") -> Column:
Returns
-------
:class:`~pyspark.sql.Column`
an array of unique values.
A new column that is an array of unique values from the input column.

Examples
--------
Example 1: Removing duplicate values from a simple array

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3, 2],)], ['data'])
>>> df.select(sf.array_distinct(df.data)).show()
+--------------------+
|array_distinct(data)|
+--------------------+
| [1, 2, 3]|
+--------------------+

Example 2: Removing duplicate values from multiple arrays

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ['data'])
>>> df.select(array_distinct(df.data)).collect()
[Row(array_distinct(data)=[1, 2, 3]), Row(array_distinct(data)=[4, 5])]
>>> df.select(sf.array_distinct(df.data)).show()
+--------------------+
|array_distinct(data)|
+--------------------+
| [1, 2, 3]|
| [4, 5]|
+--------------------+

Example 3: Removing duplicate values from an array with all identical values

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 1, 1],)], ['data'])
>>> df.select(sf.array_distinct(df.data)).show()
+--------------------+
|array_distinct(data)|
+--------------------+
| [1]|
+--------------------+

Example 4: Removing duplicate values from an array with no duplicate values

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3],)], ['data'])
>>> df.select(sf.array_distinct(df.data)).show()
+--------------------+
|array_distinct(data)|
+--------------------+
| [1, 2, 3]|
+--------------------+

Example 5: Removing duplicate values from an empty array

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
>>> schema = StructType([
... StructField("data", ArrayType(IntegerType()), True)
... ])
>>> df = spark.createDataFrame([([],)], schema)
>>> df.select(sf.array_distinct(df.data)).show()
+--------------------+
|array_distinct(data)|
+--------------------+
| []|
+--------------------+
"""
return _invoke_function_over_columns("array_distinct", col)

Expand Down Expand Up @@ -13399,7 +13511,7 @@ def array_except(col1: "ColumnOrName", col2: "ColumnOrName") -> Column:
@_try_remote_functions
def array_compact(col: "ColumnOrName") -> Column:
"""
Collection function: removes null values from the array.
Array function: removes null values from the array.

.. versionadded:: 3.4.0

Expand All @@ -13411,17 +13523,77 @@ def array_compact(col: "ColumnOrName") -> Column:
Returns
-------
:class:`~pyspark.sql.Column`
an array by excluding the null values.
A new column that is an array excluding the null values from the input column.

Notes
-----
Supports Spark Connect.

Examples
--------
Example 1: Removing null values from a simple array

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, None, 2, 3],)], ['data'])
>>> df.select(sf.array_compact(df.data)).show()
+-------------------+
|array_compact(data)|
+-------------------+
| [1, 2, 3]|
+-------------------+

Example 2: Removing null values from multiple arrays

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, None, 2, 3],), ([4, 5, None, 4],)], ['data'])
>>> df.select(array_compact(df.data)).collect()
[Row(array_compact(data)=[1, 2, 3]), Row(array_compact(data)=[4, 5, 4])]
>>> df.select(sf.array_compact(df.data)).show()
+-------------------+
|array_compact(data)|
+-------------------+
| [1, 2, 3]|
| [4, 5, 4]|
+-------------------+

Example 3: Removing null values from an array with all null values

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType
>>> schema = StructType([
... StructField("data", ArrayType(StringType()), True)
... ])
>>> df = spark.createDataFrame([([None, None, None],)], schema)
>>> df.select(sf.array_compact(df.data)).show()
+-------------------+
|array_compact(data)|
+-------------------+
| []|
+-------------------+

Example 4: Removing null values from an array with no null values

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3],)], ['data'])
>>> df.select(sf.array_compact(df.data)).show()
+-------------------+
|array_compact(data)|
+-------------------+
| [1, 2, 3]|
+-------------------+

Example 5: Removing null values from an empty array

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType
>>> schema = StructType([
... StructField("data", ArrayType(StringType()), True)
... ])
>>> df = spark.createDataFrame([([],)], schema)
>>> df.select(sf.array_compact(df.data)).show()
+-------------------+
|array_compact(data)|
+-------------------+
| []|
+-------------------+
"""
return _invoke_function_over_columns("array_compact", col)

Expand Down