diff --git a/python/pyspark/errors/error-conditions.json b/python/pyspark/errors/error-conditions.json index ffe3c61737e6..edaebd7420e0 100644 --- a/python/pyspark/errors/error-conditions.json +++ b/python/pyspark/errors/error-conditions.json @@ -1374,6 +1374,11 @@ "Value for `` must be between and (inclusive), got " ] }, + "WRONG_NUM_ARGS": { + "message": [ + "Function `<func_name>` expects <expected> but got <actual>." + ] + }, "WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION": { "message": [ "Function `<func_name>` should take between 1 and 3 arguments, but the provided function takes <num_args>." diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index 0380b517e6e5..1650a19a740e 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -4019,6 +4019,7 @@ def try_make_timestamp_ltz( try_make_timestamp_ltz.__doc__ = pysparkfuncs.try_make_timestamp_ltz.__doc__ +@overload def make_timestamp_ntz( years: "ColumnOrName", months: "ColumnOrName", @@ -4027,9 +4028,35 @@ def make_timestamp_ntz( mins: "ColumnOrName", secs: "ColumnOrName", ) -> Column: - return _invoke_function_over_columns( - "make_timestamp_ntz", years, months, days, hours, mins, secs - ) + ... + + +@overload +def make_timestamp_ntz(date: "ColumnOrName", time: "ColumnOrName") -> Column: + ... 
+ + +def make_timestamp_ntz(*cols: "ColumnOrName") -> Column: + if len(cols) == 2: + # make_timestamp_ntz(date, time) + date, time = cols + return _invoke_function_over_columns("make_timestamp_ntz", date, time) + elif len(cols) == 6: + # make_timestamp_ntz(years, months, days, hours, mins, secs) + years, months, days, hours, mins, secs = cols + return _invoke_function_over_columns( + "make_timestamp_ntz", years, months, days, hours, mins, secs + ) + else: + # Invalid number of arguments + raise PySparkValueError( + errorClass="WRONG_NUM_ARGS", + messageParameters={ + "func_name": "make_timestamp_ntz", + "expected": "either (years, months, days, hours, mins, secs) or (date, time)", + "actual": f"{len(cols)} columns", + }, + ) make_timestamp_ntz.__doc__ = pysparkfuncs.make_timestamp_ntz.__doc__ diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index b09713e0c289..455d2568c5eb 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -25126,7 +25126,7 @@ def try_make_timestamp_ltz( ) -@_try_remote_functions +@overload def make_timestamp_ntz( years: "ColumnOrName", months: "ColumnOrName", @@ -25135,30 +25135,59 @@ def make_timestamp_ntz( mins: "ColumnOrName", secs: "ColumnOrName", ) -> Column: + ... + + +@overload +def make_timestamp_ntz(date: "ColumnOrName", time: "ColumnOrName") -> Column: + ... + + +@_try_remote_functions +def make_timestamp_ntz(*cols: "ColumnOrName") -> Column: """ - Create local date-time from years, months, days, hours, mins, secs fields. + Create local date-time from years, months, days, hours, mins, secs fields, or from + date and time fields. + If there are 6 cols, then this creates a timestamp from individual time components. + If there are 2 cols, then this creates a timestamp from date and time. If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL on invalid inputs. Otherwise, it will throw an error instead. .. 
versionadded:: 3.5.0 + .. versionchanged:: 4.1.0 + Supports creating the timestamp from a date column and a time column. + Parameters ---------- - years : :class:`~pyspark.sql.Column` or column name - The year to represent, from 1 to 9999 - months : :class:`~pyspark.sql.Column` or column name - The month-of-year to represent, from 1 (January) to 12 (December) - days : :class:`~pyspark.sql.Column` or column name - The day-of-month to represent, from 1 to 31 - hours : :class:`~pyspark.sql.Column` or column name - The hour-of-day to represent, from 0 to 23 - mins : :class:`~pyspark.sql.Column` or column name - The minute-of-hour to represent, from 0 to 59 - secs : :class:`~pyspark.sql.Column` or column name - The second-of-minute and its micro-fraction to represent, from 0 to 60. - The value can be either an integer like 13 , or a fraction like 13.123. - If the sec argument equals to 60, the seconds field is set - to 0 and 1 minute is added to the final timestamp. + cols : :class:`~pyspark.sql.Column` or column name + Either 6 columns (years, months, days, hours, mins, secs) + Or 2 columns (date, time) + + years : :class:`~pyspark.sql.Column` or column name + The year to represent, from 1 to 9999 + months : :class:`~pyspark.sql.Column` or column name + The month-of-year to represent, from 1 (January) to 12 (December) + days : :class:`~pyspark.sql.Column` or column name + The day-of-month to represent, from 1 to 31 + hours : :class:`~pyspark.sql.Column` or column name + The hour-of-day to represent, from 0 to 23 + mins : :class:`~pyspark.sql.Column` or column name + The minute-of-hour to represent, from 0 to 59 + secs : :class:`~pyspark.sql.Column` or column name + The second-of-minute and its micro-fraction to represent, from 0 to 60. + The value can be either an integer like 13, or a fraction like 13.123. + If the sec argument equals to 60, the seconds field is set + to 0 and 1 minute is added to the final timestamp. 
+ date : :class:`~pyspark.sql.Column` or column name + A date to represent, from 0001-01-01 to 9999-12-31 + time : :class:`~pyspark.sql.Column` or column name + A local time to represent, from 00:00:00 to 23:59:59.999999 + + Notes + ----- + This function accepts either 6 arguments (years, months, days, hours, mins, secs) + or 2 arguments (date, time). Returns ------- @@ -25179,6 +25208,8 @@ def make_timestamp_ntz( -------- >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") + Example 1: Create local date-time from year, month, day, hour, min, sec fields. + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]], ... ['year', 'month', 'day', 'hour', 'min', 'sec']) @@ -25191,11 +25222,41 @@ def make_timestamp_ntz( |2014-12-28 06:30:45.887 | +----------------------------------------------------+ + Example 2: Create local date-time from date and time fields. + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([['2014-12-28', '06:30:45.887']], + ... ['date_col', 'time_col']) + >>> df.select( + ... sf.make_timestamp_ntz(sf.to_date(df.date_col), sf.to_time(df.time_col)) + ... 
).show(truncate=False) + +--------------------------------------------------------+ + |make_timestamp_ntz(to_date(date_col), to_time(time_col))| + +--------------------------------------------------------+ + |2014-12-28 06:30:45.887 | + +--------------------------------------------------------+ + >>> spark.conf.unset("spark.sql.session.timeZone") """ - return _invoke_function_over_columns( - "make_timestamp_ntz", years, months, days, hours, mins, secs - ) + if len(cols) == 2: + # make_timestamp_ntz(date, time) + date, time = cols + return _invoke_function_over_columns("make_timestamp_ntz", date, time) + elif len(cols) == 6: + years, months, days, hours, mins, secs = cols + return _invoke_function_over_columns( + "make_timestamp_ntz", years, months, days, hours, mins, secs + ) + else: + # Invalid number of arguments + raise PySparkValueError( + errorClass="WRONG_NUM_ARGS", + messageParameters={ + "func_name": "make_timestamp_ntz", + "expected": "either (years, months, days, hours, mins, secs) or (date, time)", + "actual": f"{len(cols)} columns", + }, + ) @_try_remote_functions diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 2fde3edc2486..7ef90a6b17ec 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -485,6 +485,64 @@ def test_try_make_timestamp_ntz(self): ) assertDataFrameEqual(actual, [Row(None)]) + def test_make_timestamp_ntz_with_date_time(self): + # Test make_timestamp_ntz(date=..., time=...) 
overload + from datetime import date, time + + # Test with date and time columns + data = [(date(2024, 5, 22), time(10, 30, 45))] + df = self.spark.createDataFrame(data, ["date_col", "time_col"]) + actual = df.select(F.make_timestamp_ntz(df.date_col, df.time_col)) + assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30, 45))]) + + # Test with to_date and to_time functions + data = [("2024-05-22", "10:30:45.123")] + df = self.spark.createDataFrame(data, ["date_str", "time_str"]) + actual = df.select(F.make_timestamp_ntz(F.to_date(df.date_str), F.to_time(df.time_str))) + assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30, 45, 123000))]) + + def test_make_timestamp_ntz_error_handling(self): + # Test error handling for wrong number of arguments + with self.assertRaises(PySparkValueError) as pe: + F.make_timestamp_ntz() # No arguments + + self.check_error( + exception=pe.exception, + errorClass="WRONG_NUM_ARGS", + messageParameters={ + "func_name": "make_timestamp_ntz", + "expected": "either (years, months, days, hours, mins, secs) or (date, time)", + "actual": "0 columns", + }, + ) + + with self.assertRaises(PySparkValueError) as pe: + F.make_timestamp_ntz(F.lit(2024)) # Only 1 argument + + self.check_error( + exception=pe.exception, + errorClass="WRONG_NUM_ARGS", + messageParameters={ + "func_name": "make_timestamp_ntz", + "expected": "either (years, months, days, hours, mins, secs) or (date, time)", + "actual": "1 columns", + }, + ) + + # Test invalid number of arguments (3 arguments) + with self.assertRaises(PySparkValueError) as pe: + F.make_timestamp_ntz(F.lit(2024), F.lit(1), F.lit(1)) + + self.check_error( + exception=pe.exception, + errorClass="WRONG_NUM_ARGS", + messageParameters={ + "func_name": "make_timestamp_ntz", + "expected": "either (years, months, days, hours, mins, secs) or (date, time)", + "actual": "3 columns", + }, + ) + def test_string_functions(self): string_functions = [ "upper",