diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
index 992a2b12a462f..5de06af6af12a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
@@ -39,6 +39,18 @@ trait DateTimeFormatterHelper {
     }
   }
 
+  private def verifyLocalDate(
+      accessor: TemporalAccessor, field: ChronoField, candidate: LocalDate): Unit = {
+    if (accessor.isSupported(field)) {
+      val actual = accessor.get(field)
+      val expected = candidate.get(field)
+      if (actual != expected) {
+        throw new DateTimeException(s"Conflict found: Field $field $actual differs from" +
+          s" $field $expected derived from $candidate")
+      }
+    }
+  }
+
   protected def toLocalDate(accessor: TemporalAccessor): LocalDate = {
     val localDate = accessor.query(TemporalQueries.localDate())
     // If all the date fields are specified, return the local date directly.
@@ -48,9 +60,17 @@
     // later, and we should provide default values for missing fields.
     // To be compatible with Spark 2.4, we pick 1970 as the default value of year.
     val year = getOrDefault(accessor, ChronoField.YEAR, 1970)
-    val month = getOrDefault(accessor, ChronoField.MONTH_OF_YEAR, 1)
-    val day = getOrDefault(accessor, ChronoField.DAY_OF_MONTH, 1)
-    LocalDate.of(year, month, day)
+    if (accessor.isSupported(ChronoField.DAY_OF_YEAR)) {
+      val dayOfYear = accessor.get(ChronoField.DAY_OF_YEAR)
+      val date = LocalDate.ofYearDay(year, dayOfYear)
+      verifyLocalDate(accessor, ChronoField.MONTH_OF_YEAR, date)
+      verifyLocalDate(accessor, ChronoField.DAY_OF_MONTH, date)
+      date
+    } else {
+      val month = getOrDefault(accessor, ChronoField.MONTH_OF_YEAR, 1)
+      val day = getOrDefault(accessor, ChronoField.DAY_OF_MONTH, 1)
+      LocalDate.of(year, month, day)
+    }
   }
 
   private def toLocalTime(accessor: TemporalAccessor): LocalTime = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
index 4892deae92b3d..0a29d94dd984d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
@@ -31,6 +31,8 @@ class DateFormatterSuite extends DatetimeFormatterSuite {
     DateFormatter(pattern, UTC, isParsing)
   }
 
+  override protected def useDateFormatter: Boolean = true
+
   test("parsing dates") {
     outstandingTimezonesIds.foreach { timeZone =>
       withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala
index 31ff50fda1ad6..b78facd963338 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala
@@ -17,15 +17,61 @@
 
 package org.apache.spark.sql.catalyst.util
 
+import java.time.DateTimeException
+
 import org.scalatest.Matchers
 
 import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
 import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{date, UTC}
 
 trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers {
   import DateTimeFormatterHelper._
+  import LegacyDateFormats._
   def checkFormatterCreation(pattern: String, isParsing: Boolean): Unit
 
+  private def dateFormatter(
+      pattern: String, ldf: LegacyDateFormat = FAST_DATE_FORMAT): DateFormatter = {
+    DateFormatter(pattern, UTC, DateFormatter.defaultLocale, ldf, isParsing = true)
+  }
+
+  private def timestampFormatter(
+      pattern: String, ldf: LegacyDateFormat = SIMPLE_DATE_FORMAT): TimestampFormatter = {
+    TimestampFormatter(pattern, UTC, legacyFormat = ldf, isParsing = true)
+  }
+
+  protected def useDateFormatter: Boolean
+
+  private def assertEqual(pattern: String, datetimeStr: String, expected: Long): Unit = {
+    if (useDateFormatter) {
+      assert(dateFormatter(pattern).parse(datetimeStr) ===
+        DateTimeUtils.microsToEpochDays(expected, UTC))
+    } else {
+      assert(timestampFormatter(pattern).parse(datetimeStr) === expected)
+    }
+  }
+
+  private def assertError(pattern: String, datetimeStr: String, expectedMsg: String): Unit = {
+    if (useDateFormatter) {
+      LegacyDateFormats.values.foreach { ldf =>
+        // The legacy DateFormatter is always lenient by default
+        val e = intercept[SparkUpgradeException](dateFormatter(pattern, ldf).parse(datetimeStr))
+        assert(e.getCause.getMessage.contains(expectedMsg))
+      }
+    } else {
+      // In strict mode, the legacy TimestampFormatter fails too
+      val e = intercept[DateTimeException](timestampFormatter(pattern).parse(datetimeStr))
+      assert(e.getMessage.contains(expectedMsg))
+      // In lenient mode, the legacy TimestampFormatter does not fail
+      Seq(FAST_DATE_FORMAT, LENIENT_SIMPLE_DATE_FORMAT).foreach { ldf =>
+        val e = intercept[SparkUpgradeException] {
+          timestampFormatter(pattern, ldf).parse(datetimeStr)
+        }
+        assert(e.getCause.getMessage.contains(expectedMsg))
+      }
+    }
+  }
+
   test("explicitly forbidden datetime patterns") {
 
     Seq(true, false).foreach { isParsing =>
@@ -51,4 +97,36 @@ trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
       pattern => intercept[SparkUpgradeException](checkFormatterCreation(pattern, true))
     }
   }
+
+  test("SPARK-31939: Fix parsing day of year when year field pattern is missing") {
+    // resolved to a queryable LocalDate, or fail directly
+    assertEqual("yyyy-dd-DD", "2020-29-60", date(2020, 2, 29))
+    assertError("yyyy-dd-DD", "2020-02-60",
+      "Field DayOfMonth 29 differs from DayOfMonth 2 derived from 2020-02-29")
+    assertEqual("yyyy-MM-DD", "2020-02-60", date(2020, 2, 29))
+    assertError("yyyy-MM-DD", "2020-03-60",
+      "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 2020-02-29")
+    assertEqual("yyyy-MM-dd-DD", "2020-02-29-60", date(2020, 2, 29))
+    assertError("yyyy-MM-dd-DD", "2020-03-01-60",
+      "Field DayOfYear 61 differs from DayOfYear 60 derived from 2020-03-01")
+    assertEqual("yyyy-DDD", "2020-366", date(2020, 12, 31))
+    assertError("yyyy-DDD", "2019-366",
+      "Invalid date 'DayOfYear 366' as '2019' is not a leap year")
+
+    // unresolved by java.time itself; checked manually (fixed by SPARK-31939)
+    assertEqual("DDD", "365", date(1970, 12, 31))
+    assertError("DDD", "366",
+      "Invalid date 'DayOfYear 366' as '1970' is not a leap year")
+    assertEqual("MM-DD", "03-60", date(1970, 3))
+    assertError("MM-DD", "02-60",
+      "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
+    assertEqual("MM-dd-DD", "02-28-59", date(1970, 2, 28))
+    assertError("MM-dd-DD", "02-28-60",
+      "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
+    assertError("MM-dd-DD", "02-28-58",
+      "Field DayOfMonth 28 differs from DayOfMonth 27 derived from 1970-02-27")
+    assertEqual("dd-DD", "28-59", date(1970, 2, 28))
+    assertError("dd-DD", "27-59",
+      "Field DayOfMonth 27 differs from DayOfMonth 28 derived from 1970-02-28")
+  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
index e70f805b30f39..51286986b835c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
@@ -33,6 +33,8 @@ class TimestampFormatterSuite extends DatetimeFormatterSuite {
     TimestampFormatter(pattern, UTC, isParsing)
   }
 
+  override protected def useDateFormatter: Boolean = false
+
   test("parsing timestamps using time zones") {
     val localDate = "2018-12-02T10:11:12.001234"
     val expectedMicros = Map(
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-invalid.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-invalid.sql
new file mode 100644
index 0000000000000..0b313e5a0b9f4
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-invalid.sql
@@ -0,0 +1,20 @@
+--- TESTS FOR DATETIME PARSING FUNCTIONS WITH INVALID VALUES ---
+
+-- parsing invalid values with pattern 'D'
+select to_timestamp('366', 'D');
+select to_timestamp('9', 'DD');
+-- invalid in Java 8 but valid in Java 11; disabled for Jenkins
+-- select to_timestamp('100', 'DD');
+select to_timestamp('366', 'DD');
+select to_timestamp('9', 'DDD');
+select to_timestamp('99', 'DDD');
+select to_timestamp('30-365', 'dd-DDD');
+select to_timestamp('11-365', 'MM-DDD');
+select to_timestamp('2019-366', 'yyyy-DDD');
+select to_timestamp('12-30-365', 'MM-dd-DDD');
+select to_timestamp('2020-01-365', 'yyyy-dd-DDD');
+select to_timestamp('2020-10-350', 'yyyy-MM-DDD');
+select to_timestamp('2020-11-31-366', 'yyyy-MM-dd-DDD');
+-- a special case to test CSV: the legacy formatter it uses is lenient, so Spark should
+-- throw SparkUpgradeException
+select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD'))
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-legacy.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-legacy.sql
new file mode 100644
index 0000000000000..ee1afe502ab79
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing-legacy.sql
@@ -0,0 +1,2 @@
+--SET spark.sql.legacy.timeParserPolicy=LEGACY
+--IMPORT datetime-parsing.sql
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing.sql
new file mode 100644
index 0000000000000..74866d9c6ffa1
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime-parsing.sql
@@ -0,0 +1,16 @@
+--- TESTS FOR DATETIME PARSING FUNCTIONS ---
+
+-- parsing with pattern 'D'
+select to_timestamp('9', 'D');
+select to_timestamp('300', 'D');
+select to_timestamp('09', 'DD');
+select to_timestamp('99', 'DD');
+select to_timestamp('009', 'DDD');
+select to_timestamp('365', 'DDD');
+select to_timestamp('31-365', 'dd-DDD');
+select to_timestamp('12-365', 'MM-DDD');
+select to_timestamp('2020-365', 'yyyy-DDD');
+select to_timestamp('12-31-365', 'MM-dd-DDD');
+select to_timestamp('2020-30-365', 'yyyy-dd-DDD');
+select to_timestamp('2020-12-350', 'yyyy-MM-DDD');
+select to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD');
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out
new file mode 100644
index 0000000000000..dcee0d4c270a3
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-invalid.sql.out
@@ -0,0 +1,110 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 13
+
+
+-- !query
+select to_timestamp('366', 'D')
+-- !query schema
+struct<to_timestamp(366, D):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('9', 'DD')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+You may get a different result due to the upgrading of Spark 3.0: Fail to parse '9' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
+
+
+-- !query
+select to_timestamp('366', 'DD')
+-- !query schema
+struct<to_timestamp(366, DD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('9', 'DDD')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+You may get a different result due to the upgrading of Spark 3.0: Fail to parse '9' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
+
+
+-- !query
+select to_timestamp('99', 'DDD')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+You may get a different result due to the upgrading of Spark 3.0: Fail to parse '99' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
+
+
+-- !query
+select to_timestamp('30-365', 'dd-DDD')
+-- !query schema
+struct<to_timestamp(30-365, dd-DDD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('11-365', 'MM-DDD')
+-- !query schema
+struct<to_timestamp(11-365, MM-DDD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('2019-366', 'yyyy-DDD')
+-- !query schema
+struct<to_timestamp(2019-366, yyyy-DDD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('12-30-365', 'MM-dd-DDD')
+-- !query schema
+struct<to_timestamp(12-30-365, MM-dd-DDD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('2020-01-365', 'yyyy-dd-DDD')
+-- !query schema
+struct<to_timestamp(2020-01-365, yyyy-dd-DDD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('2020-10-350', 'yyyy-MM-DDD')
+-- !query schema
+struct<to_timestamp(2020-10-350, yyyy-MM-DDD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp('2020-11-31-366', 'yyyy-MM-dd-DDD')
+-- !query schema
+struct<to_timestamp(2020-11-31-366, yyyy-MM-dd-DDD):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD'))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2018-366' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
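To make the resolution strategy above easier to follow, here is a minimal standalone sketch using plain `java.time` and no Spark classes; the `DayOfYearResolution` object, its `verify` helper, and the sample inputs are illustrative only, not part of the patch. It mirrors the patched `toLocalDate`: when `DAY_OF_YEAR` is present, derive the date with `LocalDate.ofYearDay` (defaulting the year to 1970 for Spark 2.4 compatibility) and cross-check any parsed month or day-of-month fields against it.

```scala
import java.time.{DateTimeException, LocalDate}
import java.time.format.DateTimeFormatter
import java.time.temporal.{ChronoField, TemporalAccessor}

object DayOfYearResolution {
  // Cross-check a parsed field against the date derived from DAY_OF_YEAR,
  // mirroring the patched verifyLocalDate (parsed value first, derived second).
  private def verify(
      accessor: TemporalAccessor, field: ChronoField, candidate: LocalDate): Unit = {
    if (accessor.isSupported(field) && accessor.get(field) != candidate.get(field)) {
      throw new DateTimeException(s"Conflict found: Field $field ${accessor.get(field)}" +
        s" differs from $field ${candidate.get(field)} derived from $candidate")
    }
  }

  // Mirrors the patched toLocalDate: prefer DAY_OF_YEAR when present,
  // otherwise fall back to month/day-of-month with defaults of 1.
  def toLocalDate(accessor: TemporalAccessor): LocalDate = {
    val year =
      if (accessor.isSupported(ChronoField.YEAR)) accessor.get(ChronoField.YEAR) else 1970
    if (accessor.isSupported(ChronoField.DAY_OF_YEAR)) {
      val date = LocalDate.ofYearDay(year, accessor.get(ChronoField.DAY_OF_YEAR))
      verify(accessor, ChronoField.MONTH_OF_YEAR, date)
      verify(accessor, ChronoField.DAY_OF_MONTH, date)
      date
    } else {
      val month =
        if (accessor.isSupported(ChronoField.MONTH_OF_YEAR)) {
          accessor.get(ChronoField.MONTH_OF_YEAR)
        } else 1
      val day =
        if (accessor.isSupported(ChronoField.DAY_OF_MONTH)) {
          accessor.get(ChronoField.DAY_OF_MONTH)
        } else 1
      LocalDate.of(year, month, day)
    }
  }

  def main(args: Array[String]): Unit = {
    // Without a year field, java.time cannot resolve "MM-DD" on its own, so the
    // returned accessor still carries the raw MONTH_OF_YEAR and DAY_OF_YEAR values.
    val ok = DateTimeFormatter.ofPattern("MM-DD").parse("03-60")
    println(toLocalDate(ok)) // 1970-03-01: day 60 of 1970 falls on March 1
    val bad = DateTimeFormatter.ofPattern("MM-DD").parse("02-60")
    try toLocalDate(bad) catch {
      // Conflict found: Field MonthOfYear 2 differs from MonthOfYear 3
      // derived from 1970-03-01
      case e: DateTimeException => println(e.getMessage)
    }
  }
}
```

Note the field order in the conflict message: the patch prints the parsed value first and the derived value second, whereas the `yyyy-*` patterns are resolved by java.time's own cross-check, which prints the derived value first. That is why the `MM-DD`/`dd-DD` expectations in the test read in the opposite order from the `yyyy-dd-DD` case.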
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-parsing-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-legacy.sql.out
new file mode 100644
index 0000000000000..742e0839d919a
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-parsing-legacy.sql.out
@@ -0,0 +1,106 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 13
+
+
+-- !query
+select to_timestamp('9', 'D')
+-- !query schema
+struct<to_timestamp(9, D):timestamp>
+-- !query output
+1970-01-09 00:00:00
+
+
+-- !query
+select to_timestamp('300', 'D')
+-- !query schema
+struct<to_timestamp(300, D):timestamp>
+-- !query output
+1970-10-27 00:00:00
+
+
+-- !query
+select to_timestamp('09', 'DD')
+-- !query schema
+struct<to_timestamp(09, DD):timestamp>
+-- !query output
+1970-01-09 00:00:00
+
+
+-- !query
+select to_timestamp('99', 'DD')
+-- !query schema
+struct<to_timestamp(99, DD):timestamp>
+-- !query output
+1970-04-09 00:00:00
+
+
+-- !query
+select to_timestamp('009', 'DDD')
+-- !query schema
+struct<to_timestamp(009, DDD):timestamp>
+-- !query output
+1970-01-09 00:00:00
+
+
+-- !query
+select to_timestamp('365', 'DDD')
+-- !query schema
+struct<to_timestamp(365, DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('31-365', 'dd-DDD')
+-- !query schema
+struct<to_timestamp(31-365, dd-DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('12-365', 'MM-DDD')
+-- !query schema
+struct<to_timestamp(12-365, MM-DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('2020-365', 'yyyy-DDD')
+-- !query schema
+struct<to_timestamp(2020-365, yyyy-DDD):timestamp>
+-- !query output
+2020-12-30 00:00:00
+
+
+-- !query
+select to_timestamp('12-31-365', 'MM-dd-DDD')
+-- !query schema
+struct<to_timestamp(12-31-365, MM-dd-DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('2020-30-365', 'yyyy-dd-DDD')
+-- !query schema
+struct<to_timestamp(2020-30-365, yyyy-dd-DDD):timestamp>
+-- !query output
+2020-12-30 00:00:00
+
+
+-- !query
+select to_timestamp('2020-12-350', 'yyyy-MM-DDD')
+-- !query schema
+struct<to_timestamp(2020-12-350, yyyy-MM-DDD):timestamp>
+-- !query output
+2020-12-15 00:00:00
+
+
+-- !query
+select to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD')
+-- !query schema
+struct<to_timestamp(2020-12-31-366, yyyy-MM-dd-DDD):timestamp>
+-- !query output
+2020-12-31 00:00:00
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-parsing.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-parsing.sql.out
new file mode 100644
index 0000000000000..742e0839d919a
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-parsing.sql.out
@@ -0,0 +1,106 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 13
+
+
+-- !query
+select to_timestamp('9', 'D')
+-- !query schema
+struct<to_timestamp(9, D):timestamp>
+-- !query output
+1970-01-09 00:00:00
+
+
+-- !query
+select to_timestamp('300', 'D')
+-- !query schema
+struct<to_timestamp(300, D):timestamp>
+-- !query output
+1970-10-27 00:00:00
+
+
+-- !query
+select to_timestamp('09', 'DD')
+-- !query schema
+struct<to_timestamp(09, DD):timestamp>
+-- !query output
+1970-01-09 00:00:00
+
+
+-- !query
+select to_timestamp('99', 'DD')
+-- !query schema
+struct<to_timestamp(99, DD):timestamp>
+-- !query output
+1970-04-09 00:00:00
+
+
+-- !query
+select to_timestamp('009', 'DDD')
+-- !query schema
+struct<to_timestamp(009, DDD):timestamp>
+-- !query output
+1970-01-09 00:00:00
+
+
+-- !query
+select to_timestamp('365', 'DDD')
+-- !query schema
+struct<to_timestamp(365, DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('31-365', 'dd-DDD')
+-- !query schema
+struct<to_timestamp(31-365, dd-DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('12-365', 'MM-DDD')
+-- !query schema
+struct<to_timestamp(12-365, MM-DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('2020-365', 'yyyy-DDD')
+-- !query schema
+struct<to_timestamp(2020-365, yyyy-DDD):timestamp>
+-- !query output
+2020-12-30 00:00:00
+
+
+-- !query
+select to_timestamp('12-31-365', 'MM-dd-DDD')
+-- !query schema
+struct<to_timestamp(12-31-365, MM-dd-DDD):timestamp>
+-- !query output
+1970-12-31 00:00:00
+
+
+-- !query
+select to_timestamp('2020-30-365', 'yyyy-dd-DDD')
+-- !query schema
+struct<to_timestamp(2020-30-365, yyyy-dd-DDD):timestamp>
+-- !query output
+2020-12-30 00:00:00
+
+
+-- !query
+select to_timestamp('2020-12-350', 'yyyy-MM-DDD')
+-- !query schema
+struct<to_timestamp(2020-12-350, yyyy-MM-DDD):timestamp>
+-- !query output
+2020-12-15 00:00:00
+
+
+-- !query
+select to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD')
+-- !query schema
+struct<to_timestamp(2020-12-31-366, yyyy-MM-dd-DDD):timestamp>
+-- !query output
+2020-12-31 00:00:00
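As a sanity check on the golden outputs above, the day-of-year arithmetic can be reproduced with plain `java.time`. This `GoldenFileSpotCheck` app is a hypothetical aid for reviewers, not part of the change:

```scala
import java.time.LocalDate

object GoldenFileSpotCheck extends App {
  // to_timestamp('300', 'D') -> 1970-10-27: day 300 of the default year 1970
  assert(LocalDate.ofYearDay(1970, 300) == LocalDate.of(1970, 10, 27))
  // to_timestamp('99', 'DD') -> 1970-04-09
  assert(LocalDate.ofYearDay(1970, 99) == LocalDate.of(1970, 4, 9))
  // to_timestamp('2020-365', 'yyyy-DDD') -> 2020-12-30: 2020 is a leap year,
  // so day 365 is Dec 30 and day 366 (below) is Dec 31
  assert(LocalDate.ofYearDay(2020, 365) == LocalDate.of(2020, 12, 30))
  // to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD') -> 2020-12-31
  assert(LocalDate.ofYearDay(2020, 366) == LocalDate.of(2020, 12, 31))
  // to_timestamp('2019-366', 'yyyy-DDD') is NULL: 2019 is not a leap year
  println("all golden-file day-of-year values check out")
}
```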