@@ -39,6 +39,18 @@ trait DateTimeFormatterHelper {
    }
  }

  private def verifyLocalDate(
      accessor: TemporalAccessor, field: ChronoField, candidate: LocalDate): Unit = {
    if (accessor.isSupported(field)) {
      val actual = accessor.get(field)
      val expected = candidate.get(field)
      if (actual != expected) {
        throw new DateTimeException(s"Conflict found: Field $field $actual differs from" +
          s" $field $expected derived from $candidate")
      }
    }
  }

  protected def toLocalDate(accessor: TemporalAccessor): LocalDate = {
    val localDate = accessor.query(TemporalQueries.localDate())
    // If all the date fields are specified, return the local date directly.
@@ -48,9 +60,17 @@ trait DateTimeFormatterHelper {
    // later, and we should provide default values for missing fields.
    // To be compatible with Spark 2.4, we pick 1970 as the default value of year.
    val year = getOrDefault(accessor, ChronoField.YEAR, 1970)
-   val month = getOrDefault(accessor, ChronoField.MONTH_OF_YEAR, 1)
-   val day = getOrDefault(accessor, ChronoField.DAY_OF_MONTH, 1)
-   LocalDate.of(year, month, day)
+   if (accessor.isSupported(ChronoField.DAY_OF_YEAR)) {
+     val dayOfYear = accessor.get(ChronoField.DAY_OF_YEAR)
+     val date = LocalDate.ofYearDay(year, dayOfYear)
+     verifyLocalDate(accessor, ChronoField.MONTH_OF_YEAR, date)
+     verifyLocalDate(accessor, ChronoField.DAY_OF_MONTH, date)
+     date
+   } else {
+     val month = getOrDefault(accessor, ChronoField.MONTH_OF_YEAR, 1)
+     val day = getOrDefault(accessor, ChronoField.DAY_OF_MONTH, 1)
+     LocalDate.of(year, month, day)
+   }
  }

  private def toLocalTime(accessor: TemporalAccessor): LocalTime = {
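For intuition, the derive-then-verify step in toLocalDate can be reproduced with plain java.time (a standalone sketch using illustrative values, not part of the patch):

import java.time.LocalDate
import java.time.temporal.ChronoField

// Day 60 of 1970 is March 1st, since 1970 is not a leap year; a parsed
// month-of-year of 2 would therefore conflict with the derived value 3.
val candidate = LocalDate.ofYearDay(1970, 60)
assert(candidate.get(ChronoField.MONTH_OF_YEAR) == 3)
assert(candidate.get(ChronoField.DAY_OF_MONTH) == 1)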
@@ -31,6 +31,8 @@ class DateFormatterSuite extends DatetimeFormatterSuite {
    DateFormatter(pattern, UTC, isParsing)
  }

  override protected def useDateFormatter: Boolean = true

test("parsing dates") {
outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
@@ -17,15 +17,61 @@

package org.apache.spark.sql.catalyst.util

import java.time.DateTimeException

import org.scalatest.Matchers

import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{date, UTC}

trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers {
  import DateTimeFormatterHelper._
  import LegacyDateFormats._
  def checkFormatterCreation(pattern: String, isParsing: Boolean): Unit

  private def dateFormatter(
      pattern: String, ldf: LegacyDateFormat = FAST_DATE_FORMAT): DateFormatter = {
    DateFormatter(pattern, UTC, DateFormatter.defaultLocale, ldf, isParsing = true)
  }

  private def timestampFormatter(
      pattern: String, ldf: LegacyDateFormat = SIMPLE_DATE_FORMAT): TimestampFormatter = {
    TimestampFormatter(pattern, UTC, legacyFormat = ldf, isParsing = true)
  }

  protected def useDateFormatter: Boolean

  private def assertEqual(pattern: String, datetimeStr: String, expected: Long): Unit = {
    if (useDateFormatter) {
      assert(dateFormatter(pattern).parse(datetimeStr) ===
        DateTimeUtils.microsToEpochDays(expected, UTC))
    } else {
      assert(timestampFormatter(pattern).parse(datetimeStr) === expected)
    }
  }

  private def assertError(pattern: String, datetimeStr: String, expectedMsg: String): Unit = {
    if (useDateFormatter) {
      LegacyDateFormats.values.foreach { ldf =>
        // The legacy DateFormatter is always lenient by default
        val e = intercept[SparkUpgradeException](dateFormatter(pattern, ldf).parse(datetimeStr))
        assert(e.getCause.getMessage.contains(expectedMsg))
      }
    } else {
      // In strict mode, the legacy TimestampFormatter fails too
      val e = intercept[DateTimeException](timestampFormatter(pattern).parse(datetimeStr))
      assert(e.getMessage.contains(expectedMsg))
      // In lenient mode, the legacy TimestampFormatter does not fail
      Seq(FAST_DATE_FORMAT, LENIENT_SIMPLE_DATE_FORMAT).foreach { ldf =>
        val e = intercept[SparkUpgradeException] {
          timestampFormatter(pattern, ldf).parse(datetimeStr)
        }
        assert(e.getCause.getMessage.contains(expectedMsg))
      }
    }
  }
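  // For example, assertError("yyyy-dd-DD", "2020-02-60", ...) expects the new
  // parser to reject the conflicting fields, while a lenient legacy formatter
  // would accept them, so Spark reports a SparkUpgradeException wrapping the
  // original error.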

test("explicitly forbidden datetime patterns") {

Seq(true, false).foreach { isParsing =>
Expand All @@ -51,4 +97,36 @@ trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
pattern => intercept[SparkUpgradeException](checkFormatterCreation(pattern, true))
}
}

test("SPARK-31939: Fix Parsing day of year when year field pattern is missing") {
// resolved to queryable LocaleDate or fail directly
assertEqual("yyyy-dd-DD", "2020-29-60", date(2020, 2, 29))
assertError("yyyy-dd-DD", "2020-02-60",
"Field DayOfMonth 29 differs from DayOfMonth 2 derived from 2020-02-29")
assertEqual("yyyy-MM-DD", "2020-02-60", date(2020, 2, 29))
assertError("yyyy-MM-DD", "2020-03-60",
"Field MonthOfYear 2 differs from MonthOfYear 3 derived from 2020-02-29")
assertEqual("yyyy-MM-dd-DD", "2020-02-29-60", date(2020, 2, 29))
assertError("yyyy-MM-dd-DD", "2020-03-01-60",
"Field DayOfYear 61 differs from DayOfYear 60 derived from 2020-03-01")
assertEqual("yyyy-DDD", "2020-366", date(2020, 12, 31))
assertError("yyyy-DDD", "2019-366",
"Invalid date 'DayOfYear 366' as '2019' is not a leap year")

// unresolved and need to check manually(SPARK-31939 fixed)
assertEqual("DDD", "365", date(1970, 12, 31))
assertError("DDD", "366",
"Invalid date 'DayOfYear 366' as '1970' is not a leap year")
assertEqual("MM-DD", "03-60", date(1970, 3))
assertError("MM-DD", "02-60",
"Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
assertEqual("MM-dd-DD", "02-28-59", date(1970, 2, 28))
assertError("MM-dd-DD", "02-28-60",
"Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
assertError("MM-dd-DD", "02-28-58",
"Field DayOfMonth 28 differs from DayOfMonth 27 derived from 1970-02-27")
assertEqual("dd-DD", "28-59", date(1970, 2, 28))
assertError("dd-DD", "27-59",
"Field DayOfMonth 27 differs from DayOfMonth 28 derived from 1970-02-28")
}
}
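Note the two message shapes asserted above: when the year field is present, the conflict is raised by java.time's own resolver, which prints the value derived from the resolved date first; when Spark's verifyLocalDate raises it, the parsed value comes first. The resolver half can be reproduced standalone (a sketch; the pattern and input are illustrative):

import java.time.format.{DateTimeFormatter, ResolverStyle}

// Year 2020 plus day-of-year 60 resolve to 2020-02-29, so the explicit month 3
// fails the resolver's cross-check with a DateTimeException like
// "Conflict found: Field MonthOfYear 2 differs from MonthOfYear 3 derived from 2020-02-29".
val fmt = DateTimeFormatter.ofPattern("uuuu-MM-DDD").withResolverStyle(ResolverStyle.STRICT)
fmt.parse("2020-03-060")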
@@ -33,6 +33,8 @@ class TimestampFormatterSuite extends DatetimeFormatterSuite {
    TimestampFormatter(pattern, UTC, isParsing)
  }

  override protected def useDateFormatter: Boolean = false

test("parsing timestamps using time zones") {
val localDate = "2018-12-02T10:11:12.001234"
val expectedMicros = Map(
@@ -0,0 +1,20 @@
--- TESTS FOR DATETIME PARSING FUNCTIONS WITH INVALID VALUES ---

-- parsing invalid values with pattern 'D'
select to_timestamp('366', 'D');
select to_timestamp('9', 'DD');
-- in Java 8 this case is invalid, but it is valid in Java 11; disabled for Jenkins
-- select to_timestamp('100', 'DD');
select to_timestamp('366', 'DD');
select to_timestamp('9', 'DDD');
select to_timestamp('99', 'DDD');
select to_timestamp('30-365', 'dd-DDD');
select to_timestamp('11-365', 'MM-DDD');
select to_timestamp('2019-366', 'yyyy-DDD');
select to_timestamp('12-30-365', 'MM-dd-DDD');
select to_timestamp('2020-01-365', 'yyyy-dd-DDD');
select to_timestamp('2020-10-350', 'yyyy-MM-DDD');
select to_timestamp('2020-11-31-366', 'yyyy-MM-dd-DDD');
-- a special case to test csv: the legacy formatter it uses is lenient, so Spark should
-- throw SparkUpgradeException
select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD'));
Contributor:
let's add a comment to explain why we need to test csv: because it's lenient and Spark should throw upgrade exception.
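The csv case can be tried from the Scala side as well (a usage sketch; spark is an assumed SparkSession):

// The lenient legacy formatter used for CSV dates accepts '2018-366' while the
// new parser rejects it, so the query fails with a SparkUpgradeException.
spark.sql("select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD'))").show()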

@@ -0,0 +1,2 @@
--SET spark.sql.legacy.timeParserPolicy=LEGACY
--IMPORT datetime-parsing.sql
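The same policy can be set programmatically before running the imported queries (a sketch; spark is an assumed SparkSession, and the key matches the --SET line above):

spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")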
16 changes: 16 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/datetime-parsing.sql
@@ -0,0 +1,16 @@
--- TESTS FOR DATETIME PARSING FUNCTIONS ---
Contributor:
there are a lot more tests that should be added here later :)


-- parsing with pattern 'D'
select to_timestamp('9', 'D');
select to_timestamp('300', 'D');
select to_timestamp('09', 'DD');
select to_timestamp('99', 'DD');
select to_timestamp('009', 'DDD');
select to_timestamp('365', 'DDD');
select to_timestamp('31-365', 'dd-DDD');
select to_timestamp('12-365', 'MM-DDD');
select to_timestamp('2020-365', 'yyyy-DDD');
select to_timestamp('12-31-365', 'MM-dd-DDD');
select to_timestamp('2020-30-365', 'yyyy-dd-DDD');
select to_timestamp('2020-12-350', 'yyyy-MM-DDD');
select to_timestamp('2020-12-31-366', 'yyyy-MM-dd-DDD');
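The valid cases can also be exercised through the Scala API (a usage sketch; spark is an assumed SparkSession, and the values mirror the inputs above):

// Day 365 of leap year 2020 is 2020-12-30.
spark.sql("select to_timestamp('2020-365', 'yyyy-DDD')").show()
// An invalid day of year, such as 366 in the non-leap year 2019, yields NULL
// under the default CORRECTED policy (see the generated results below).
spark.sql("select to_timestamp('2019-366', 'yyyy-DDD')").show()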
@@ -0,0 +1,110 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13


-- !query
select to_timestamp('366', 'D')
-- !query schema
struct<to_timestamp(366, D):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('9', 'DD')
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkUpgradeException
You may get a different result due to the upgrading of Spark 3.0: Fail to parse '9' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.


-- !query
select to_timestamp('366', 'DD')
-- !query schema
struct<to_timestamp(366, DD):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('9', 'DDD')
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkUpgradeException
You may get a different result due to the upgrading of Spark 3.0: Fail to parse '9' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.


-- !query
select to_timestamp('99', 'DDD')
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkUpgradeException
You may get a different result due to the upgrading of Spark 3.0: Fail to parse '99' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.


-- !query
select to_timestamp('30-365', 'dd-DDD')
-- !query schema
struct<to_timestamp(30-365, dd-DDD):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('11-365', 'MM-DDD')
-- !query schema
struct<to_timestamp(11-365, MM-DDD):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('2019-366', 'yyyy-DDD')
-- !query schema
struct<to_timestamp(2019-366, yyyy-DDD):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('12-30-365', 'MM-dd-DDD')
-- !query schema
struct<to_timestamp(12-30-365, MM-dd-DDD):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('2020-01-365', 'yyyy-dd-DDD')
-- !query schema
struct<to_timestamp(2020-01-365, yyyy-dd-DDD):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('2020-10-350', 'yyyy-MM-DDD')
-- !query schema
struct<to_timestamp(2020-10-350, yyyy-MM-DDD):timestamp>
-- !query output
NULL


-- !query
select to_timestamp('2020-11-31-366', 'yyyy-MM-dd-DDD')
-- !query schema
struct<to_timestamp(2020-11-31-366, yyyy-MM-dd-DDD):timestamp>
-- !query output
NULL


-- !query
select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD'))
-- !query schema
struct<>
-- !query output
org.apache.spark.SparkUpgradeException
You may get a different result due to the upgrading of Spark 3.0: Fail to parse '2018-366' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.