From 727a981575ecce61d78b84215c70f3ba024fe486 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Thu, 21 Dec 2023 14:36:53 +0800 Subject: [PATCH 1/5] init --- python/pyspark/sql/functions/builtin.py | 215 +++++++++++++++++++++--- 1 file changed, 189 insertions(+), 26 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 54a91792404d..05831114be74 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -12875,9 +12875,8 @@ def get(col: "ColumnOrName", index: Union["ColumnOrName", int]) -> Column: @_try_remote_functions def array_prepend(col: "ColumnOrName", value: Any) -> Column: """ - Collection function: Returns an array containing element as - well as all elements from array. The new element is positioned - at the beginning of the array. + Array function: Returns an array containing the given element as + the first element and the rest of the elements from the original array. .. versionadded:: 3.5.0 @@ -12891,13 +12890,72 @@ def array_prepend(col: "ColumnOrName", value: Any) -> Column: Returns ------- :class:`~pyspark.sql.Column` - an array excluding given value. + an array with the given value prepended. 
Examples -------- - >>> df = spark.createDataFrame([([2, 3, 4],), ([],)], ['data']) - >>> df.select(array_prepend(df.data, 1)).collect() - [Row(array_prepend(data, 1)=[1, 2, 3, 4]), Row(array_prepend(data, 1)=[1])] + Example 1: Prepending a column value to an array column + + >>> from pyspark.sql import Row, functions as sf + >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")]) + >>> df.select(sf.array_prepend(df.c1, df.c2)).show() + +---------------------+ + |array_prepend(c1, c2)| + +---------------------+ + | [c, b, a, c]| + +---------------------+ + + Example 2: Prepending a numeric value to an array column + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) + >>> df.select(sf.array_prepend(df.data, 4)).show() + +----------------------+ + |array_prepend(data, 4)| + +----------------------+ + | [4, 1, 2, 3]| + +----------------------+ + + Example 3: Prepending a null value to an array column + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) + >>> df.select(sf.array_prepend(df.data, None)).show() + +-------------------------+ + |array_prepend(data, NULL)| + +-------------------------+ + | [NULL, 1, 2, 3]| + +-------------------------+ + + Example 4: Prepending a value to a NULL array column + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... 
]) + >>> df = spark.createDataFrame([(None,)], schema=schema) + >>> df.select(sf.array_prepend(df.data, 4)).show() + +----------------------+ + |array_prepend(data, 4)| + +----------------------+ + | NULL| + +----------------------+ + + Example 5: Prepending a value to an empty array + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... ]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.array_prepend(df.data, 1)).show() + +----------------------+ + |array_prepend(data, 1)| + +----------------------+ + | [1]| + +----------------------+ """ return _invoke_function_over_columns("array_prepend", col, lit(value)) @@ -12965,7 +13023,7 @@ def array_distinct(col: "ColumnOrName") -> Column: @_try_remote_functions def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: Any) -> Column: """ - Collection function: adds an item into a given array at a specified array index. + Array function: Inserts an item into a given array at a specified array index. Array indices start at 1, or start from the end if index is negative. Index above array size appends the array, or prepends the array if index is negative, with 'null' elements. @@ -12993,14 +13051,65 @@ def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: An Examples -------- - >>> df = spark.createDataFrame( - ... [(['a', 'b', 'c'], 2, 'd'), (['c', 'b', 'a'], -2, 'd')], - ... ['data', 'pos', 'val'] - ... 
) - >>> df.select(array_insert(df.data, df.pos.cast('integer'), df.val).alias('data')).collect() - [Row(data=['a', 'd', 'b', 'c']), Row(data=['c', 'b', 'd', 'a'])] - >>> df.select(array_insert(df.data, 5, 'hello').alias('data')).collect() - [Row(data=['a', 'b', 'c', None, 'hello']), Row(data=['c', 'b', 'a', None, 'hello'])] + Example 1: Inserting a value at a specific position + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) + >>> df.select(sf.array_insert(df.data, 2, 'd')).show() + +------------------------+ + |array_insert(data, 2, d)| + +------------------------+ + | [a, d, b, c]| + +------------------------+ + + Example 2: Inserting a value at a negative position + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) + >>> df.select(sf.array_insert(df.data, -2, 'd')).show() + +-------------------------+ + |array_insert(data, -2, d)| + +-------------------------+ + | [a, b, d, c]| + +-------------------------+ + + Example 3: Inserting a value at a position greater than the array size + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) + >>> df.select(sf.array_insert(df.data, 5, 'e')).show() + +------------------------+ + |array_insert(data, 5, e)| + +------------------------+ + | [a, b, c, NULL, e]| + +------------------------+ + + Example 4: Inserting a NULL value + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import StringType + >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) + >>> df.select(sf.array_insert(df.data, 2, sf.lit(None).cast(StringType()))).show() + +-------------------------------------------+ + |array_insert(data, 2, CAST(NULL AS STRING))| + +-------------------------------------------+ + | [a, NULL, b, c]| + +-------------------------------------------+ + + Example 5: Inserting a value into a NULL array + + >>> from 
pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... ]) + >>> df = spark.createDataFrame([(None,)], schema=schema) + >>> df.select(sf.array_insert(df.data, 1, 5)).show() + +------------------------+ + |array_insert(data, 1, 5)| + +------------------------+ + | NULL| + +------------------------+ """ pos = lit(pos) if isinstance(pos, int) else pos @@ -13139,22 +13248,21 @@ def array_compact(col: "ColumnOrName") -> Column: @_try_remote_functions def array_append(col: "ColumnOrName", value: Any) -> Column: """ - Collection function: returns an array of the elements in col1 along - with the added element in col2 at the last of the array. + Array function: Returns a new array column by appending `value` to the existing array `col`. .. versionadded:: 3.4.0 Parameters ---------- col : :class:`~pyspark.sql.Column` or str - name of column containing array + The name of the column containing the array. value : - a literal value, or a :class:`~pyspark.sql.Column` expression. + A literal value, or a :class:`~pyspark.sql.Column` expression to be appended to the array. Returns ------- :class:`~pyspark.sql.Column` - an array of values from first array along with the element. + A new array column with `value` appended to the original array. 
Notes ----- @@ -13162,12 +13270,67 @@ def array_append(col: "ColumnOrName", value: Any) -> Column: Examples -------- - >>> from pyspark.sql import Row + Example 1: Appending a column value to an array column + + >>> from pyspark.sql import Row, functions as sf >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")]) - >>> df.select(array_append(df.c1, df.c2)).collect() - [Row(array_append(c1, c2)=['b', 'a', 'c', 'c'])] - >>> df.select(array_append(df.c1, 'x')).collect() - [Row(array_append(c1, x)=['b', 'a', 'c', 'x'])] + >>> df.select(sf.array_append(df.c1, df.c2)).show() + +--------------------+ + |array_append(c1, c2)| + +--------------------+ + | [b, a, c, c]| + +--------------------+ + + Example 2: Appending a numeric value to an array column + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) + >>> df.select(sf.array_append(df.data, 4)).show() + +---------------------+ + |array_append(data, 4)| + +---------------------+ + | [1, 2, 3, 4]| + +---------------------+ + + Example 3: Appending a null value to an array column + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) + >>> df.select(sf.array_append(df.data, None)).show() + +------------------------+ + |array_append(data, NULL)| + +------------------------+ + | [1, 2, 3, NULL]| + +------------------------+ + + Example 4: Appending a value to a NULL array column + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... 
]) + >>> df = spark.createDataFrame([(None,)], schema=schema) + >>> df.select(sf.array_append(df.data, 4)).show() + +---------------------+ + |array_append(data, 4)| + +---------------------+ + | NULL| + +---------------------+ + + Example 5: Appending a value to an empty array + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... ]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.array_append(df.data, 1)).show() + +---------------------+ + |array_append(data, 1)| + +---------------------+ + | [1]| + +---------------------+ """ return _invoke_function_over_columns("array_append", col, lit(value)) From f1fa3bbaa5d0928d801da643acb19655b6d8cbab Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Thu, 21 Dec 2023 14:41:39 +0800 Subject: [PATCH 2/5] check ansi is true first --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bdcb1dd1ea5c..f28371488c07 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -379,6 +379,7 @@ jobs: SKIP_PACKAGING: true METASPACE_SIZE: 1g BRANCH: ${{ inputs.branch }} + SPARK_ANSI_SQL_MODE: true steps: - name: Checkout Spark repository uses: actions/checkout@v4 From ccfb9a121054efbcfab838dc723897ba471c9031 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Thu, 21 Dec 2023 15:46:26 +0800 Subject: [PATCH 3/5] fix --- python/pyspark/sql/functions/builtin.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 05831114be74..640904000822 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -13090,11 +13090,11 @@ def array_insert(arr: 
"ColumnOrName", pos: Union["ColumnOrName", int], value: An >>> from pyspark.sql.types import StringType >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) >>> df.select(sf.array_insert(df.data, 2, sf.lit(None).cast(StringType()))).show() - +-------------------------------------------+ - |array_insert(data, 2, CAST(NULL AS STRING))| - +-------------------------------------------+ - | [a, NULL, b, c]| - +-------------------------------------------+ + +---------------------------+ + |array_insert(data, 2, NULL)| + +---------------------------+ + | [a, NULL, b, c]| + +---------------------------+ Example 5: Inserting a value into a NULL array @@ -13319,6 +13319,7 @@ def array_append(col: "ColumnOrName", value: Any) -> Column: +---------------------+ Example 5: Appending a value to an empty array + >>> from pyspark.sql import functions as sf >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField >>> schema = StructType([ From 0f05ea5739272a6008698512370b0e12ffb9e908 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Thu, 21 Dec 2023 16:05:41 +0800 Subject: [PATCH 4/5] use alias --- python/pyspark/sql/functions/builtin.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 640904000822..571572df30aa 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -13089,12 +13089,13 @@ def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: An >>> from pyspark.sql import functions as sf >>> from pyspark.sql.types import StringType >>> df = spark.createDataFrame([(['a', 'b', 'c'],)], ['data']) - >>> df.select(sf.array_insert(df.data, 2, sf.lit(None).cast(StringType()))).show() - +---------------------------+ - |array_insert(data, 2, NULL)| - +---------------------------+ - | [a, NULL, b, c]| - +---------------------------+ + >>> df.select(sf.array_insert(df.data, 2, 
sf.lit(None).cast(StringType())) + ... .alias("result")).show() + +---------------+ + | result| + +---------------+ + |[a, NULL, b, c]| + +---------------+ Example 5: Inserting a value into a NULL array From ef16dace720d386a0f500035e53406b353a6fcd5 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Thu, 21 Dec 2023 20:49:00 +0800 Subject: [PATCH 5/5] revert build and test yml --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f28371488c07..bdcb1dd1ea5c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -379,7 +379,6 @@ jobs: SKIP_PACKAGING: true METASPACE_SIZE: 1g BRANCH: ${{ inputs.branch }} - SPARK_ANSI_SQL_MODE: true steps: - name: Checkout Spark repository uses: actions/checkout@v4