217 changes: 191 additions & 26 deletions python/pyspark/sql/functions/builtin.py
@@ -12875,9 +12875,8 @@ def get(col: "ColumnOrName", index: Union["ColumnOrName", int]) -> Column:
@_try_remote_functions
def array_prepend(col: "ColumnOrName", value: Any) -> Column:
"""
Collection function: Returns an array containing element as
well as all elements from array. The new element is positioned
at the beginning of the array.
Array function: Returns an array containing the given element as
the first element and the rest of the elements from the original array.

.. versionadded:: 3.5.0

@@ -12891,13 +12890,72 @@ def array_prepend(col: "ColumnOrName", value: Any) -> Column:
Returns
-------
:class:`~pyspark.sql.Column`
an array excluding given value.
an array with the given value prepended.

Examples
--------
>>> df = spark.createDataFrame([([2, 3, 4],), ([],)], ['data'])
>>> df.select(array_prepend(df.data, 1)).collect()
[Row(array_prepend(data, 1)=[1, 2, 3, 4]), Row(array_prepend(data, 1)=[1])]
Example 1: Prepending a column value to an array column

>>> from pyspark.sql import Row, functions as sf
>>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")])
>>> df.select(sf.array_prepend(df.c1, df.c2)).show()
+---------------------+
|array_prepend(c1, c2)|
+---------------------+
|         [c, b, a, c]|
+---------------------+

Example 2: Prepending a numeric value to an array column

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3],)], ['data'])
>>> df.select(sf.array_prepend(df.data, 4)).show()
+----------------------+
|array_prepend(data, 4)|
+----------------------+
|          [4, 1, 2, 3]|
+----------------------+

Example 3: Prepending a null value to an array column

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3],)], ['data'])
>>> df.select(sf.array_prepend(df.data, None)).show()
+-------------------------+
|array_prepend(data, NULL)|
+-------------------------+
|          [NULL, 1, 2, 3]|
+-------------------------+

Example 4: Prepending a value to a NULL array column

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
>>> schema = StructType([
... StructField("data", ArrayType(IntegerType()), True)
... ])
>>> df = spark.createDataFrame([(None,)], schema=schema)
>>> df.select(sf.array_prepend(df.data, 4)).show()
+----------------------+
|array_prepend(data, 4)|
+----------------------+
|                  NULL|
+----------------------+

Example 5: Prepending a value to an empty array

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
>>> schema = StructType([
... StructField("data", ArrayType(IntegerType()), True)
... ])
>>> df = spark.createDataFrame([([],)], schema=schema)
>>> df.select(sf.array_prepend(df.data, 1)).show()
+----------------------+
|array_prepend(data, 1)|
+----------------------+
|                   [1]|
+----------------------+
"""
return _invoke_function_over_columns("array_prepend", col, lit(value))

@@ -12965,7 +13023,7 @@ def array_distinct(col: "ColumnOrName") -> Column:
@_try_remote_functions
def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: Any) -> Column:
"""
Collection function: adds an item into a given array at a specified array index.
Array function: Inserts an item into a given array at a specified array index.
Array indices start at 1, or start from the end if index is negative.
Index above array size appends the array, or prepends the array if index is negative,
with 'null' elements.
@@ -12993,14 +13051,66 @@ def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: Any) -> Column:

Examples
--------
>>> df = spark.createDataFrame(
... [(['a', 'b', 'c'], 2, 'd'), (['c', 'b', 'a'], -2, 'd')],
... ['data', 'pos', 'val']
... )
>>> df.select(array_insert(df.data, df.pos.cast('integer'), df.val).alias('data')).collect()
[Row(data=['a', 'd', 'b', 'c']), Row(data=['c', 'b', 'd', 'a'])]
>>> df.select(array_insert(df.data, 5, 'hello').alias('data')).collect()
[Row(data=['a', 'b', 'c', None, 'hello']), Row(data=['c', 'b', 'a', None, 'hello'])]
Example 1: Inserting a value at a specific position

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data'])
>>> df.select(sf.array_insert(df.data, 2, 'd')).show()
+------------------------+
|array_insert(data, 2, d)|
+------------------------+
|            [a, d, b, c]|
+------------------------+

Example 2: Inserting a value at a negative position

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data'])
>>> df.select(sf.array_insert(df.data, -2, 'd')).show()
+-------------------------+
|array_insert(data, -2, d)|
+-------------------------+
|             [a, b, d, c]|
+-------------------------+

Example 3: Inserting a value at a position greater than the array size

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data'])
>>> df.select(sf.array_insert(df.data, 5, 'e')).show()
+------------------------+
|array_insert(data, 5, e)|
+------------------------+
|      [a, b, c, NULL, e]|
+------------------------+

Example 4: Inserting a NULL value

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import StringType
>>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data'])
>>> df.select(sf.array_insert(df.data, 2, sf.lit(None).cast(StringType()))
... .alias("result")).show()
+---------------+
|         result|
+---------------+
|[a, NULL, b, c]|
+---------------+

Contributor Author: It seems that the headers of the results from the connect and sql tests are inconsistent; try giving it an alias.

Example 5: Inserting a value into a NULL array

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
>>> schema = StructType([
... StructField("data", ArrayType(IntegerType()), True)
... ])
>>> df = spark.createDataFrame([(None,)], schema=schema)
>>> df.select(sf.array_insert(df.data, 1, 5)).show()
+------------------------+
|array_insert(data, 1, 5)|
+------------------------+
|                    NULL|
+------------------------+
"""
pos = lit(pos) if isinstance(pos, int) else pos
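The `isinstance` dispatch above also means `pos` can be a per-row Column rather than a Python int; the pre-change doctest removed in this hunk exercised exactly that. Reproduced here as a sketch, assuming an active `spark` session:

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame(
...     [(['a', 'b', 'c'], 2, 'd'), (['c', 'b', 'a'], -2, 'd')],
...     ['data', 'pos', 'val']
... )
>>> df.select(sf.array_insert(df.data, df.pos.cast('integer'), df.val).alias('data')).collect()
[Row(data=['a', 'd', 'b', 'c']), Row(data=['c', 'b', 'd', 'a'])]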

@@ -13139,35 +13249,90 @@ def array_compact(col: "ColumnOrName") -> Column:
@_try_remote_functions
def array_append(col: "ColumnOrName", value: Any) -> Column:
"""
Collection function: returns an array of the elements in col1 along
with the added element in col2 at the last of the array.
Array function: returns a new array column by appending `value` to the existing array `col`.

.. versionadded:: 3.4.0

Parameters
----------
col : :class:`~pyspark.sql.Column` or str
name of column containing array
The name of the column containing the array.
value :
a literal value, or a :class:`~pyspark.sql.Column` expression.
A literal value, or a :class:`~pyspark.sql.Column` expression to be appended to the array.

Returns
-------
:class:`~pyspark.sql.Column`
an array of values from first array along with the element.
A new array column with `value` appended to the original array.

Notes
-----
Supports Spark Connect.

Examples
--------
>>> from pyspark.sql import Row
Example 1: Appending a column value to an array column

>>> from pyspark.sql import Row, functions as sf
>>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")])
>>> df.select(array_append(df.c1, df.c2)).collect()
[Row(array_append(c1, c2)=['b', 'a', 'c', 'c'])]
>>> df.select(array_append(df.c1, 'x')).collect()
[Row(array_append(c1, x)=['b', 'a', 'c', 'x'])]
>>> df.select(sf.array_append(df.c1, df.c2)).show()
+--------------------+
|array_append(c1, c2)|
+--------------------+
|        [b, a, c, c]|
+--------------------+

Example 2: Appending a numeric value to an array column

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3],)], ['data'])
>>> df.select(sf.array_append(df.data, 4)).show()
+---------------------+
|array_append(data, 4)|
+---------------------+
|         [1, 2, 3, 4]|
+---------------------+

Example 3: Appending a null value to an array column

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([([1, 2, 3],)], ['data'])
>>> df.select(sf.array_append(df.data, None)).show()
+------------------------+
|array_append(data, NULL)|
+------------------------+
|         [1, 2, 3, NULL]|
+------------------------+

Example 4: Appending a value to a NULL array column

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
>>> schema = StructType([
... StructField("data", ArrayType(IntegerType()), True)
... ])
>>> df = spark.createDataFrame([(None,)], schema=schema)
>>> df.select(sf.array_append(df.data, 4)).show()
+---------------------+
|array_append(data, 4)|
+---------------------+
| NULL|
+---------------------+

Example 5: Appending a value to an empty array

>>> from pyspark.sql import functions as sf
>>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField
>>> schema = StructType([
... StructField("data", ArrayType(IntegerType()), True)
... ])
>>> df = spark.createDataFrame([([],)], schema=schema)
>>> df.select(sf.array_append(df.data, 1)).show()
+---------------------+
|array_append(data, 1)|
+---------------------+
|                  [1]|
+---------------------+
"""
return _invoke_function_over_columns("array_append", col, lit(value))
