diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
index 0380b517e6e5..e98a975c27e3 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -4019,6 +4019,7 @@ def try_make_timestamp_ltz(
 try_make_timestamp_ltz.__doc__ = pysparkfuncs.try_make_timestamp_ltz.__doc__
 
 
+@overload
 def make_timestamp_ntz(
     years: "ColumnOrName",
     months: "ColumnOrName",
@@ -4027,14 +4028,59 @@ def make_timestamp_ntz(
     mins: "ColumnOrName",
     secs: "ColumnOrName",
 ) -> Column:
-    return _invoke_function_over_columns(
-        "make_timestamp_ntz", years, months, days, hours, mins, secs
-    )
+    ...
+
+
+@overload
+def make_timestamp_ntz(
+    *,
+    date: "ColumnOrName",
+    time: "ColumnOrName",
+) -> Column:
+    ...
+
+
+def make_timestamp_ntz(
+    years: Optional["ColumnOrName"] = None,
+    months: Optional["ColumnOrName"] = None,
+    days: Optional["ColumnOrName"] = None,
+    hours: Optional["ColumnOrName"] = None,
+    mins: Optional["ColumnOrName"] = None,
+    secs: Optional["ColumnOrName"] = None,
+    *,
+    date: Optional["ColumnOrName"] = None,
+    time: Optional["ColumnOrName"] = None,
+) -> Column:
+    if years is not None:
+        if any(arg is not None for arg in [date, time]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "make_timestamp_ntz",
+            cast("ColumnOrName", years),
+            cast("ColumnOrName", months),
+            cast("ColumnOrName", days),
+            cast("ColumnOrName", hours),
+            cast("ColumnOrName", mins),
+            cast("ColumnOrName", secs),
+        )
+    else:
+        if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+        )
 
 
 make_timestamp_ntz.__doc__ = pysparkfuncs.make_timestamp_ntz.__doc__
 
 
+@overload
 def try_make_timestamp_ntz(
     years: "ColumnOrName",
     months: "ColumnOrName",
@@ -4043,9 +4089,53 @@ def try_make_timestamp_ntz(
     mins: "ColumnOrName",
     secs: "ColumnOrName",
 ) -> Column:
-    return _invoke_function_over_columns(
-        "try_make_timestamp_ntz", years, months, days, hours, mins, secs
-    )
+    ...
+
+
+@overload
+def try_make_timestamp_ntz(
+    *,
+    date: "ColumnOrName",
+    time: "ColumnOrName",
+) -> Column:
+    ...
+
+
+def try_make_timestamp_ntz(
+    years: Optional["ColumnOrName"] = None,
+    months: Optional["ColumnOrName"] = None,
+    days: Optional["ColumnOrName"] = None,
+    hours: Optional["ColumnOrName"] = None,
+    mins: Optional["ColumnOrName"] = None,
+    secs: Optional["ColumnOrName"] = None,
+    *,
+    date: Optional["ColumnOrName"] = None,
+    time: Optional["ColumnOrName"] = None,
+) -> Column:
+    if years is not None:
+        if any(arg is not None for arg in [date, time]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "try_make_timestamp_ntz",
+            cast("ColumnOrName", years),
+            cast("ColumnOrName", months),
+            cast("ColumnOrName", days),
+            cast("ColumnOrName", hours),
+            cast("ColumnOrName", mins),
+            cast("ColumnOrName", secs),
+        )
+    else:
+        if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "try_make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+        )
 
 
 try_make_timestamp_ntz.__doc__ = pysparkfuncs.try_make_timestamp_ntz.__doc__
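The Spark Connect implementation above deliberately mirrors the classic API: exactly one of the two argument groups may be supplied, and the date/time pair is keyword-only (enforced by the bare `*` in the signature). A minimal sketch of the two calling conventions, assuming an active session on a Spark build with TIME support (4.1+); the column aliases are illustrative, not part of the patch:

```python
from datetime import date, time

import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Component form: all six fields, positional or keyword.
df = spark.createDataFrame(
    [(2024, 5, 22, 10, 30, 0)], ["y", "mo", "d", "h", "mi", "s"]
)
df.select(sf.make_timestamp_ntz("y", "mo", "d", "h", "mi", "s")).show()

# Date/time form: keyword-only, so passing the pair positionally is a type
# error for checkers and fails at runtime.
dt = spark.range(1).select(
    sf.lit(date(2024, 5, 22)).alias("dt"),
    sf.lit(time(10, 30, 0)).alias("tm"),
)
dt.select(sf.make_timestamp_ntz(date=dt.dt, time=dt.tm)).show()
```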
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 0bec14d10d44..9b4557f16365 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -25126,7 +25126,7 @@ def try_make_timestamp_ltz(
     )
 
 
-@_try_remote_functions
+@overload
 def make_timestamp_ntz(
     years: "ColumnOrName",
     months: "ColumnOrName",
@@ -25134,31 +25134,78 @@ def make_timestamp_ntz(
     hours: "ColumnOrName",
     mins: "ColumnOrName",
     secs: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@overload
+def make_timestamp_ntz(
+    *,
+    date: "ColumnOrName",
+    time: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@_try_remote_functions
+def make_timestamp_ntz(
+    years: Optional["ColumnOrName"] = None,
+    months: Optional["ColumnOrName"] = None,
+    days: Optional["ColumnOrName"] = None,
+    hours: Optional["ColumnOrName"] = None,
+    mins: Optional["ColumnOrName"] = None,
+    secs: Optional["ColumnOrName"] = None,
+    *,
+    date: Optional["ColumnOrName"] = None,
+    time: Optional["ColumnOrName"] = None,
 ) -> Column:
     """
-    Create local date-time from years, months, days, hours, mins, secs fields.
-    If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL
-    on invalid inputs. Otherwise, it will throw an error instead.
+    Create local date-time from years, months, days, hours, mins, secs fields. Alternatively,
+    create local date-time from date and time fields. If the configuration `spark.sql.ansi.enabled`
+    is false, the function returns NULL on invalid inputs. Otherwise, it will throw an error.
 
     .. versionadded:: 3.5.0
 
+    .. versionchanged:: 4.1.0
+        Added support for creating timestamps from date and time.
+
     Parameters
     ----------
-    years : :class:`~pyspark.sql.Column` or column name
-        The year to represent, from 1 to 9999
-    months : :class:`~pyspark.sql.Column` or column name
-        The month-of-year to represent, from 1 (January) to 12 (December)
-    days : :class:`~pyspark.sql.Column` or column name
-        The day-of-month to represent, from 1 to 31
-    hours : :class:`~pyspark.sql.Column` or column name
-        The hour-of-day to represent, from 0 to 23
-    mins : :class:`~pyspark.sql.Column` or column name
-        The minute-of-hour to represent, from 0 to 59
-    secs : :class:`~pyspark.sql.Column` or column name
+    years : :class:`~pyspark.sql.Column` or column name, optional
+        The year to represent, from 1 to 9999.
+        Required when creating timestamps from individual components.
+        Must be used with months, days, hours, mins, and secs.
+    months : :class:`~pyspark.sql.Column` or column name, optional
+        The month-of-year to represent, from 1 (January) to 12 (December).
+        Required when creating timestamps from individual components.
+        Must be used with years, days, hours, mins, and secs.
+    days : :class:`~pyspark.sql.Column` or column name, optional
+        The day-of-month to represent, from 1 to 31.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, hours, mins, and secs.
+    hours : :class:`~pyspark.sql.Column` or column name, optional
+        The hour-of-day to represent, from 0 to 23.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, mins, and secs.
+    mins : :class:`~pyspark.sql.Column` or column name, optional
+        The minute-of-hour to represent, from 0 to 59.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, hours, and secs.
+    secs : :class:`~pyspark.sql.Column` or column name, optional
         The second-of-minute and its micro-fraction to represent, from 0 to 60.
-        The value can be either an integer like 13 , or a fraction like 13.123.
+        The value can be either an integer like 13, or a fraction like 13.123.
         If the sec argument equals to 60, the seconds field is set to 0 and 1 minute
         is added to the final timestamp.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, hours, and mins.
+    date : :class:`~pyspark.sql.Column` or column name, optional
+        The date to represent, in valid DATE format.
+        Required when creating timestamps from date and time components.
+        Must be used with the time parameter only.
+    time : :class:`~pyspark.sql.Column` or column name, optional
+        The time to represent, in valid TIME format.
+        Required when creating timestamps from date and time components.
+        Must be used with the date parameter only.
 
     Returns
     -------
@@ -25179,6 +25226,8 @@ def make_timestamp_ntz(
     --------
     >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
 
+    Example 1: Make local date-time from years, months, days, hours, mins, secs.
+
     >>> import pyspark.sql.functions as sf
     >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]],
     ...     ['year', 'month', 'day', 'hour', 'min', 'sec'])
@@ -25191,14 +25240,50 @@ def make_timestamp_ntz(
     |2014-12-28 06:30:45.887                             |
     +----------------------------------------------------+
 
+    Example 2: Make local date-time from date and time.
+
+    >>> import pyspark.sql.functions as sf
+    >>> from datetime import date, time
+    >>> df = spark.range(1).select(
+    ...     sf.lit(date(2014, 12, 28)).alias("date"),
+    ...     sf.lit(time(6, 30, 45, 887000)).alias("time")
+    ... )
+    >>> df.select(sf.make_timestamp_ntz(date=df.date, time=df.time)).show(truncate=False)
+    +------------------------------+
+    |make_timestamp_ntz(date, time)|
+    +------------------------------+
+    |2014-12-28 06:30:45.887       |
+    +------------------------------+
+
     >>> spark.conf.unset("spark.sql.session.timeZone")
     """
-    return _invoke_function_over_columns(
-        "make_timestamp_ntz", years, months, days, hours, mins, secs
-    )
+    if years is not None:
+        if any(arg is not None for arg in [date, time]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "make_timestamp_ntz",
+            cast("ColumnOrName", years),
+            cast("ColumnOrName", months),
+            cast("ColumnOrName", days),
+            cast("ColumnOrName", hours),
+            cast("ColumnOrName", mins),
+            cast("ColumnOrName", secs),
+        )
+    else:
+        if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+        )
 
 
-@_try_remote_functions
+@overload
 def try_make_timestamp_ntz(
     years: "ColumnOrName",
     months: "ColumnOrName",
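Both dispatch branches validate eagerly on the driver: mixing the two argument groups raises before any Column expression is constructed or shipped to the JVM or Connect server. A quick sketch of what a caller sees; the string column names are placeholders:

```python
import pyspark.sql.functions as sf
from pyspark.errors import PySparkValueError

# Mixing the component group with the date/time group fails fast,
# before any expression is built or a query runs.
try:
    sf.make_timestamp_ntz(years="y", date="d", time="t")
except PySparkValueError as e:
    print(e)  # reports CANNOT_SET_TOGETHER, listing both argument groups
```

The `@overload` stubs encode the same rule statically: no overload accepts arguments from both groups, so type checkers flag the mix before runtime.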
@@ -25206,30 +25291,78 @@ def try_make_timestamp_ntz(
     hours: "ColumnOrName",
     mins: "ColumnOrName",
     secs: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@overload
+def try_make_timestamp_ntz(
+    *,
+    date: "ColumnOrName",
+    time: "ColumnOrName",
+) -> Column:
+    ...
+
+
+@_try_remote_functions
+def try_make_timestamp_ntz(
+    years: Optional["ColumnOrName"] = None,
+    months: Optional["ColumnOrName"] = None,
+    days: Optional["ColumnOrName"] = None,
+    hours: Optional["ColumnOrName"] = None,
+    mins: Optional["ColumnOrName"] = None,
+    secs: Optional["ColumnOrName"] = None,
+    *,
+    date: Optional["ColumnOrName"] = None,
+    time: Optional["ColumnOrName"] = None,
 ) -> Column:
     """
-    Try to create local date-time from years, months, days, hours, mins, secs fields.
-    The function returns NULL on invalid inputs.
+    Try to create local date-time from years, months, days, hours, mins, secs fields.
+    Alternatively, try to create local date-time from date and time fields. The function
+    returns NULL on invalid inputs.
 
     .. versionadded:: 4.0.0
 
+    .. versionchanged:: 4.1.0
+        Added support for creating timestamps from date and time.
+
     Parameters
     ----------
-    years : :class:`~pyspark.sql.Column` or column name
-        The year to represent, from 1 to 9999
-    months : :class:`~pyspark.sql.Column` or column name
-        The month-of-year to represent, from 1 (January) to 12 (December)
-    days : :class:`~pyspark.sql.Column` or column name
-        The day-of-month to represent, from 1 to 31
-    hours : :class:`~pyspark.sql.Column` or column name
-        The hour-of-day to represent, from 0 to 23
-    mins : :class:`~pyspark.sql.Column` or column name
-        The minute-of-hour to represent, from 0 to 59
-    secs : :class:`~pyspark.sql.Column` or column name
+    years : :class:`~pyspark.sql.Column` or column name, optional
+        The year to represent, from 1 to 9999.
+        Required when creating timestamps from individual components.
+        Must be used with months, days, hours, mins, and secs.
+    months : :class:`~pyspark.sql.Column` or column name, optional
+        The month-of-year to represent, from 1 (January) to 12 (December).
+        Required when creating timestamps from individual components.
+        Must be used with years, days, hours, mins, and secs.
+    days : :class:`~pyspark.sql.Column` or column name, optional
+        The day-of-month to represent, from 1 to 31.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, hours, mins, and secs.
+    hours : :class:`~pyspark.sql.Column` or column name, optional
+        The hour-of-day to represent, from 0 to 23.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, mins, and secs.
+    mins : :class:`~pyspark.sql.Column` or column name, optional
+        The minute-of-hour to represent, from 0 to 59.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, hours, and secs.
+    secs : :class:`~pyspark.sql.Column` or column name, optional
         The second-of-minute and its micro-fraction to represent, from 0 to 60.
-        The value can be either an integer like 13 , or a fraction like 13.123.
+        The value can be either an integer like 13, or a fraction like 13.123.
         If the sec argument equals to 60, the seconds field is set to 0 and 1 minute
         is added to the final timestamp.
+        Required when creating timestamps from individual components.
+        Must be used with years, months, days, hours, and mins.
+    date : :class:`~pyspark.sql.Column` or column name, optional
+        The date to represent, in valid DATE format.
+        Required when creating timestamps from date and time components.
+        Must be used with the time parameter only.
+    time : :class:`~pyspark.sql.Column` or column name, optional
+        The time to represent, in valid TIME format.
+        Required when creating timestamps from date and time components.
+        Must be used with the date parameter only.
 
     Returns
     -------
@@ -25280,9 +25413,30 @@ def try_make_timestamp_ntz(
     >>> spark.conf.unset("spark.sql.session.timeZone")
     """
-    return _invoke_function_over_columns(
-        "try_make_timestamp_ntz", years, months, days, hours, mins, secs
-    )
+    if years is not None:
+        if any(arg is not None for arg in [date, time]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "try_make_timestamp_ntz",
+            cast("ColumnOrName", years),
+            cast("ColumnOrName", months),
+            cast("ColumnOrName", days),
+            cast("ColumnOrName", hours),
+            cast("ColumnOrName", mins),
+            cast("ColumnOrName", secs),
+        )
+    else:
+        if any(arg is not None for arg in [years, months, days, hours, mins, secs]):
+            raise PySparkValueError(
+                errorClass="CANNOT_SET_TOGETHER",
+                messageParameters={"arg_list": "years|months|days|hours|mins|secs and date|time"},
+            )
+        return _invoke_function_over_columns(
+            "try_make_timestamp_ntz", cast("ColumnOrName", date), cast("ColumnOrName", time)
+        )
 
 
 @_try_remote_functions
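The two functions differ only in failure semantics, which the docstrings above spell out: under ANSI mode `make_timestamp_ntz` raises on an impossible date, while `try_make_timestamp_ntz` yields NULL. A sketch of the contrast, assuming an active session (February 29, 2023 does not exist):

```python
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.ansi.enabled", True)

df = spark.createDataFrame(
    [(2023, 2, 29, 0, 0, 0)], ["y", "mo", "d", "h", "mi", "s"]
)

# try_ variant: the invalid date becomes a NULL timestamp.
df.select(sf.try_make_timestamp_ntz("y", "mo", "d", "h", "mi", "s")).show()

# Strict variant: the same input raises at execution time under ANSI mode.
try:
    df.select(sf.make_timestamp_ntz("y", "mo", "d", "h", "mi", "s")).collect()
except Exception as exc:
    print(type(exc).__name__)
```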
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 8e01f5d170ff..5e27f742eb63 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -468,13 +468,18 @@ def test_try_make_timestamp_ltz(self):
         assertDataFrameEqual(actual, [Row(None)])
 
     def test_try_make_timestamp_ntz(self):
+        """Test cases for try_make_timestamp_ntz with 6-parameter and date/time forms."""
+
+        # Test 1: Valid 6 positional arguments
         data = [(2024, 5, 22, 10, 30, 0)]
+        result = datetime.datetime(2024, 5, 22, 10, 30)
         df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"])
         actual = df.select(
             F.try_make_timestamp_ntz(df.year, df.month, df.day, df.hour, df.minute, df.second)
         )
-        assertDataFrameEqual(actual, [Row(datetime.datetime(2024, 5, 22, 10, 30))])
+        assertDataFrameEqual(actual, [Row(result)])
 
+        # Test 2: Invalid input (month=13) - should return NULL
         data = [(2024, 13, 22, 10, 30, 0)]
         df = self.spark.createDataFrame(data, ["year", "month", "day", "hour", "minute", "second"])
         actual = df.select(
@@ -482,6 +487,161 @@ def test_try_make_timestamp_ntz(self):
         )
         assertDataFrameEqual(actual, [Row(None)])
 
+        # Test 3: Date/time keyword arguments
+        df = self.spark.range(1).select(
+            F.lit(datetime.date(2024, 5, 22)).alias("date"),
+            F.lit(datetime.time(10, 30, 0)).alias("time"),
+        )
+        actual = df.select(F.try_make_timestamp_ntz(date=df.date, time=df.time))
+        assertDataFrameEqual(actual, [Row(result)])
+
+        # Test 4: All 6 keyword arguments
+        df_full = self.spark.createDataFrame(
+            [(2024, 5, 22, 10, 30, 45)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        actual = df_full.select(
+            F.try_make_timestamp_ntz(
+                years=df_full.year,
+                months=df_full.month,
+                days=df_full.day,
+                hours=df_full.hour,
+                mins=df_full.minute,
+                secs=df_full.second,
+            )
+        )
+        expected = datetime.datetime(2024, 5, 22, 10, 30, 45)
+        assertDataFrameEqual(actual, [Row(expected)])
+
+        # Test 5: Only year provided - should raise Exception for missing required parameters
+        with self.assertRaises(Exception):
+            F.try_make_timestamp_ntz(years=df_full.year)
+
+        # Test 6: Partial parameters - should raise Exception for missing required parameters
+        with self.assertRaises(Exception):
+            F.try_make_timestamp_ntz(years=df_full.year, months=df_full.month, days=df_full.day)
+
+        # Test 7: Partial parameters - should raise Exception for missing required parameters
+        with self.assertRaises(Exception):
+            F.try_make_timestamp_ntz(
+                years=df_full.year, months=df_full.month, days=df_full.day, hours=df_full.hour
+            )
+
+        # Test 8: Fractional seconds
+        df_frac = self.spark.createDataFrame(
+            [(2024, 5, 22, 10, 30, 45.123)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        actual = df_frac.select(
+            F.try_make_timestamp_ntz(
+                df_frac.year,
+                df_frac.month,
+                df_frac.day,
+                df_frac.hour,
+                df_frac.minute,
+                df_frac.second,
+            )
+        )
+        expected_frac = datetime.datetime(2024, 5, 22, 10, 30, 45, 123000)
+        assertDataFrameEqual(actual, [Row(expected_frac)])
+
+        # Test 9: Edge case - February 29 in leap year (full 6 parameters)
+        df_leap = self.spark.createDataFrame(
+            [(2024, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        actual = df_leap.select(
+            F.try_make_timestamp_ntz(
+                df_leap.year,
+                df_leap.month,
+                df_leap.day,
+                df_leap.hour,
+                df_leap.minute,
+                df_leap.second,
+            )
+        )
+        expected_leap = datetime.datetime(2024, 2, 29, 0, 0, 0)
+        assertDataFrameEqual(actual, [Row(expected_leap)])
+
+        # Test 10: Edge case - February 29 in non-leap year (should return NULL)
+        df_non_leap = self.spark.createDataFrame(
+            [(2023, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        actual = df_non_leap.select(
+            F.try_make_timestamp_ntz(
+                df_non_leap.year,
+                df_non_leap.month,
+                df_non_leap.day,
+                df_non_leap.hour,
+                df_non_leap.minute,
+                df_non_leap.second,
+            )
+        )
+        assertDataFrameEqual(actual, [Row(None)])
+
+        # Test 11: Minimum valid values (full 6 parameters)
+        df_min = self.spark.createDataFrame(
+            [(1, 1, 1, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        actual = df_min.select(
+            F.try_make_timestamp_ntz(
+                df_min.year, df_min.month, df_min.day, df_min.hour, df_min.minute, df_min.second
+            )
+        )
+        expected_min = datetime.datetime(1, 1, 1, 0, 0, 0)
+        assertDataFrameEqual(actual, [Row(expected_min)])
+
+        # Test 12: Maximum valid hour/minute/second
+        df_max_time = self.spark.createDataFrame(
+            [(2024, 5, 22, 23, 59, 59)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        actual = df_max_time.select(
+            F.try_make_timestamp_ntz(
+                df_max_time.year,
+                df_max_time.month,
+                df_max_time.day,
+                df_max_time.hour,
+                df_max_time.minute,
+                df_max_time.second,
+            )
+        )
+        expected_max_time = datetime.datetime(2024, 5, 22, 23, 59, 59)
+        assertDataFrameEqual(actual, [Row(expected_max_time)])
+
+        # Test 13: Invalid hour (should return NULL)
+        df_invalid_hour = self.spark.createDataFrame(
+            [(2024, 5, 22, 25, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        actual = df_invalid_hour.select(
+            F.try_make_timestamp_ntz(
+                df_invalid_hour.year,
+                df_invalid_hour.month,
+                df_invalid_hour.day,
+                df_invalid_hour.hour,
+                df_invalid_hour.minute,
+                df_invalid_hour.second,
+            )
+        )
+        assertDataFrameEqual(actual, [Row(None)])
+
+        # Test 14: Valid date/time combination with NULL date
+        df = self.spark.range(1).select(
+            F.lit(None).cast("date").alias("date"), F.lit(datetime.time(10, 30, 0)).alias("time")
+        )
+        actual = df.select(F.try_make_timestamp_ntz(date=df.date, time=df.time))
+        assertDataFrameEqual(actual, [Row(None)])
+
+        # Test 15: Valid date/time combination with NULL time
+        df = self.spark.range(1).select(
+            F.lit(datetime.date(2024, 5, 22)).alias("date"), F.lit(None).cast("time").alias("time")
+        )
+        actual = df.select(F.try_make_timestamp_ntz(date=df.date, time=df.time))
+        assertDataFrameEqual(actual, [Row(None)])
+
+        # Test 16: Mixed parameter usage should raise PySparkValueError
+        with self.assertRaises(PySparkValueError) as context:
+            F.try_make_timestamp_ntz(years=df_full.year, date=df_full.year)
+        error_msg = str(context.exception)
+        self.assertIn("CANNOT_SET_TOGETHER", error_msg)
+        self.assertIn("years|months|days|hours|mins|secs and date|time", error_msg)
+
     def test_string_functions(self):
         string_functions = [
             "upper",
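Tests 14 and 15 pin down an easy-to-miss distinction: a NULL date or time input is not "mixed usage" and not an error; it simply propagates, so the produced timestamp is NULL. Interactively that looks roughly like this (a sketch, assuming an active session):

```python
import datetime

import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# A NULL on either side of the date/time pair yields a NULL timestamp,
# with no client-side validation error and no ANSI failure involved.
df = spark.range(1).select(
    sf.lit(None).cast("date").alias("d"),
    sf.lit(datetime.time(10, 30)).alias("t"),
)
df.select(sf.try_make_timestamp_ntz(date=df.d, time=df.t)).show()  # one NULL row
```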
"minute", "second"] + ) + + actual = df_frac.select( + F.make_timestamp_ntz( + df_frac.year, + df_frac.month, + df_frac.day, + df_frac.hour, + df_frac.minute, + df_frac.second, + ) + ) + assertDataFrameEqual(actual, [Row(result_frac)]) + + # Test 5: Fractional seconds with keyword arguments + actual = df_frac.select( + F.make_timestamp_ntz( + years=df_frac.year, + months=df_frac.month, + days=df_frac.day, + hours=df_frac.hour, + mins=df_frac.minute, + secs=df_frac.second, + ) + ) + assertDataFrameEqual(actual, [Row(result_frac)]) + + # Test 6: Fractional seconds with date/time arguments + df_dt_frac = self.spark.range(1).select( + F.lit(datetime.date(2024, 5, 22)).alias("date"), + F.lit(datetime.time(10, 30, 45, 123000)).alias("time"), + ) + actual = df_dt_frac.select(F.make_timestamp_ntz(date=df_dt_frac.date, time=df_dt_frac.time)) + assertDataFrameEqual(actual, [Row(result_frac)]) + + # Test 7: Edge case - February 29 in leap year + df_leap = self.spark.createDataFrame( + [(2024, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"] + ) + expected_leap = datetime.datetime(2024, 2, 29, 0, 0, 0) + actual = df_leap.select( + F.make_timestamp_ntz( + df_leap.year, + df_leap.month, + df_leap.day, + df_leap.hour, + df_leap.minute, + df_leap.second, + ) + ) + assertDataFrameEqual(actual, [Row(expected_leap)]) + + # Test 8: Maximum valid time values + df_max = self.spark.createDataFrame( + [(2024, 12, 31, 23, 59, 59)], ["year", "month", "day", "hour", "minute", "second"] + ) + expected_max = datetime.datetime(2024, 12, 31, 23, 59, 59) + actual = df_max.select( + F.make_timestamp_ntz( + df_max.year, df_max.month, df_max.day, df_max.hour, df_max.minute, df_max.second + ) + ) + assertDataFrameEqual(actual, [Row(expected_max)]) + + # Test 9: Minimum valid values + df_min = self.spark.createDataFrame( + [(1, 1, 1, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"] + ) + expected_min = datetime.datetime(1, 1, 1, 0, 0, 0) + actual = df_min.select( + F.make_timestamp_ntz( + df_min.year, df_min.month, df_min.day, df_min.hour, df_min.minute, df_min.second + ) + ) + assertDataFrameEqual(actual, [Row(expected_min)]) + + # Test 10: Mixed positional and keyword (should work for valid combinations) + actual = df.select( + F.make_timestamp_ntz( + df.year, df.month, df.day, hours=df.hour, mins=df.minute, secs=df.second + ) + ) + assertDataFrameEqual(actual, [Row(result)]) + + # Test 11: Using literal values + actual = self.spark.range(1).select( + F.make_timestamp_ntz(F.lit(2024), F.lit(5), F.lit(22), F.lit(10), F.lit(30), F.lit(0)) + ) + assertDataFrameEqual(actual, [Row(result)]) + + # Test 12: Using string column names + actual = df.select(F.make_timestamp_ntz("year", "month", "day", "hour", "minute", "second")) + assertDataFrameEqual(actual, [Row(result)]) + + # Error handling tests + + # Test 13: Mixing timestamp and date/time keyword arguments + with self.assertRaises(PySparkValueError) as context: + df_dt.select( + F.make_timestamp_ntz(years=df.year, date=df_dt.date, time=df_dt.time) + ).collect() + error_msg = str(context.exception) + self.assertIn("CANNOT_SET_TOGETHER", error_msg) + self.assertIn("years|months|days|hours|mins|secs and date|time", error_msg) + + # Test 14: Incomplete keyword arguments - should raise Exception for None values + with self.assertRaises(Exception): + F.make_timestamp_ntz(years=df.year, months=df.month, days=df.day) + + # Test 15: Only one keyword argument - should raise Exception for None values + with self.assertRaises(Exception): + 
+            F.make_timestamp_ntz(years=df.year)
+
+        # Test 16: Only date without time - should raise Exception for None values
+        with self.assertRaises(Exception):
+            F.make_timestamp_ntz(date=df_dt.date)
+
+        # Test 17: Invalid data types - should raise exception for invalid string to int cast
+        with self.assertRaises(Exception):
+            self.spark.range(1).select(
+                F.make_timestamp_ntz(
+                    F.lit("invalid"), F.lit(5), F.lit(22), F.lit(10), F.lit(30), F.lit(0)
+                )
+            ).collect()
+
+        # Test 18: Out of range values (month=13) - should raise exception for invalid date
+        df_invalid = self.spark.createDataFrame(
+            [(2024, 13, 22, 10, 30, 0)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        with self.assertRaises(Exception):
+            df_invalid.select(
+                F.make_timestamp_ntz(
+                    df_invalid.year,
+                    df_invalid.month,
+                    df_invalid.day,
+                    df_invalid.hour,
+                    df_invalid.minute,
+                    df_invalid.second,
+                )
+            ).collect()
+
+        # Test 19: Out of range values (hour=25) - should raise exception for invalid time
+        df_invalid_hour = self.spark.createDataFrame(
+            [(2024, 5, 22, 25, 30, 0)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        with self.assertRaises(Exception):
+            df_invalid_hour.select(
+                F.make_timestamp_ntz(
+                    df_invalid_hour.year,
+                    df_invalid_hour.month,
+                    df_invalid_hour.day,
+                    df_invalid_hour.hour,
+                    df_invalid_hour.minute,
+                    df_invalid_hour.second,
+                )
+            ).collect()
+
+        # Test 20: February 29 in non-leap year
+        df_non_leap = self.spark.createDataFrame(
+            [(2023, 2, 29, 0, 0, 0)], ["year", "month", "day", "hour", "minute", "second"]
+        )
+        with self.assertRaises(Exception):  # Should raise runtime exception for invalid date
+            df_non_leap.select(
+                F.make_timestamp_ntz(
+                    df_non_leap.year,
+                    df_non_leap.month,
+                    df_non_leap.day,
+                    df_non_leap.hour,
+                    df_non_leap.minute,
+                    df_non_leap.second,
+                )
+            ).collect()
+
     def test_make_date(self):
         # SPARK-36554: expose make_date expression
         df = self.spark.createDataFrame([(2020, 6, 26)], ["Y", "M", "D"])
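Since the classic and Connect modules now expose identical overloads and validation, user code behaves the same against either backend. One pattern worth highlighting from Test 10 above: the six components may be split between positional and keyword style, as long as all six are supplied. A small standalone sketch, assuming an active session:

```python
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(2024, 5, 22, 10, 30, 0)], ["year", "month", "day", "hour", "minute", "second"]
)

# Positional and keyword components can be mixed freely within the six-field
# group; only mixing that group with date=/time= is rejected.
df.select(
    sf.make_timestamp_ntz(
        df.year, df.month, df.day, hours=df.hour, mins=df.minute, secs=df.second
    )
).show()
```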