diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index a5ef1c2b1d04..2cb963fe4cd0 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -67,9 +67,7 @@ license: |
 
 - Since Spark 3.0, Proleptic Gregorian calendar is used in parsing, formatting, and converting dates and timestamps as well as in extracting sub-components like years, days and etc. Spark 3.0 uses Java 8 API classes from the java.time packages that based on ISO chronology (https://docs.oracle.com/javase/8/docs/api/java/time/chrono/IsoChronology.html). In Spark version 2.4 and earlier, those operations are performed by using the hybrid calendar (Julian + Gregorian, see https://docs.oracle.com/javase/7/docs/api/java/util/GregorianCalendar.html). The changes impact on the results for dates before October 15, 1582 (Gregorian) and affect on the following Spark 3.0 API:
 
-  - CSV/JSON datasources use java.time API for parsing and generating CSV/JSON content. In Spark version 2.4 and earlier, java.text.SimpleDateFormat is used for the same purpose with fallbacks to the parsing mechanisms of Spark 2.0 and 1.x. For example, `2018-12-08 10:39:21.123` with the pattern `yyyy-MM-dd'T'HH:mm:ss.SSS` cannot be parsed since Spark 3.0 because the timestamp does not match to the pattern but it can be parsed by earlier Spark versions due to a fallback to `Timestamp.valueOf`. To parse the same timestamp since Spark 3.0, the pattern should be `yyyy-MM-dd HH:mm:ss.SSS`.
-
-  - The `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions. New implementation supports pattern formats as described here https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html and performs strict checking of its input. For example, the `2015-07-22 10:00:00` timestamp cannot be parse if pattern is `yyyy-MM-dd` because the parser does not consume whole input. Another example is the `31/01/2015 00:00` input cannot be parsed by the `dd/MM/yyyy hh:mm` pattern because `hh` supposes hours in the range `1-12`.
+  - Parsing/formatting of timestamp/date strings. This affects CSV/JSON datasources and the `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions when user-specified patterns are used for parsing and formatting. Since Spark 3.0, the conversions are based on `java.time.format.DateTimeFormatter`, see https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html. The new implementation performs strict checking of its input. For example, the `2015-07-22 10:00:00` timestamp cannot be parsed with the pattern `yyyy-MM-dd` because the parser does not consume the whole input. Another example is that the `31/01/2015 00:00` input cannot be parsed by the `dd/MM/yyyy hh:mm` pattern because `hh` expects hours in the range `1-12`. In Spark version 2.4 and earlier, `java.text.SimpleDateFormat` is used for timestamp/date string conversions, and the supported patterns are described in https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html. The old behavior can be restored by setting `spark.sql.legacy.timeParser.enabled` to `true`.
 
 - The `weekofyear`, `weekday`, `dayofweek`, `date_trunc`, `from_utc_timestamp`, `to_utc_timestamp`, and `unix_timestamp` functions use java.time API for calculation week number of year, day number of week as well for conversion from/to TimestampType values in UTC time zone.
 
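To make the documented behavior change concrete, here is a minimal spark-shell sketch built around the `2015-07-22 10:00:00` example from the note above. It assumes a running `spark` session (as in spark-shell); the expected results are the ones implied by the migration note, not captured console output.

```scala
import org.apache.spark.sql.functions.{col, to_date}
import spark.implicits._

val df = Seq("2015-07-22 10:00:00").toDF("s")

// Spark 3.0 default: the pattern must consume the whole input, so the value
// fails to parse and to_date is expected to yield null.
spark.conf.set("spark.sql.legacy.timeParser.enabled", false)
df.select(to_date(col("s"), "yyyy-MM-dd")).show()

// Legacy SimpleDateFormat behavior: the trailing " 10:00:00" is ignored and
// the date 2015-07-22 is expected, as in Spark 2.4 and earlier.
spark.conf.set("spark.sql.legacy.timeParser.enabled", true)
df.select(to_date(col("s"), "yyyy-MM-dd")).show()
```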
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index 7f982b019c8d..28189b65dee9 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -20,7 +20,10 @@ package org.apache.spark.sql.catalyst.util
 import java.time.{LocalDate, ZoneId}
 import java.util.Locale
 
-import DateTimeUtils.{convertSpecialDate, localDateToDays}
+import org.apache.commons.lang3.time.FastDateFormat
+
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, localDateToDays}
+import org.apache.spark.sql.internal.SQLConf
 
 sealed trait DateFormatter extends Serializable {
   def parse(s: String): Int // returns days since epoch
@@ -48,17 +51,41 @@
   }
 }
 
+class LegacyDateFormatter(pattern: String, locale: Locale) extends DateFormatter {
+  @transient
+  private lazy val format = FastDateFormat.getInstance(pattern, locale)
+
+  override def parse(s: String): Int = {
+    val milliseconds = format.parse(s).getTime
+    DateTimeUtils.millisToDays(milliseconds)
+  }
+
+  override def format(days: Int): String = {
+    val date = DateTimeUtils.toJavaDate(days)
+    format.format(date)
+  }
+}
+
 object DateFormatter {
-  val defaultPattern: String = "uuuu-MM-dd"
   val defaultLocale: Locale = Locale.US
 
   def apply(format: String, zoneId: ZoneId, locale: Locale): DateFormatter = {
-    new Iso8601DateFormatter(format, zoneId, locale)
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyDateFormatter(format, locale)
+    } else {
+      new Iso8601DateFormatter(format, zoneId, locale)
+    }
   }
 
   def apply(format: String, zoneId: ZoneId): DateFormatter = {
     apply(format, zoneId, defaultLocale)
   }
 
-  def apply(zoneId: ZoneId): DateFormatter = apply(defaultPattern, zoneId)
+  def apply(zoneId: ZoneId): DateFormatter = {
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyDateFormatter("yyyy-MM-dd", defaultLocale)
+    } else {
+      new Iso8601DateFormatter("uuuu-MM-dd", zoneId, defaultLocale)
+    }
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index 5be4807083fa..fe1a4fe710c2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -22,10 +22,14 @@ import java.time._
 import java.time.format.DateTimeParseException
 import java.time.temporal.ChronoField.MICRO_OF_SECOND
 import java.time.temporal.TemporalQueries
-import java.util.Locale
+import java.util.{Locale, TimeZone}
 import java.util.concurrent.TimeUnit.SECONDS
 
-import DateTimeUtils.convertSpecialTimestamp
+import org.apache.commons.lang3.time.FastDateFormat
+
+import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.convertSpecialTimestamp
+import org.apache.spark.sql.internal.SQLConf
 
 sealed trait TimestampFormatter extends Serializable {
   /**
@@ -86,12 +90,32 @@ class FractionTimestampFormatter(zoneId: ZoneId)
   override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
 }
 
+class LegacyTimestampFormatter(
+    pattern: String,
+    zoneId: ZoneId,
+    locale: Locale) extends TimestampFormatter {
+
+  @transient private lazy val format =
+    FastDateFormat.getInstance(pattern, TimeZone.getTimeZone(zoneId), locale)
+
+  protected def toMillis(s: String): Long = format.parse(s).getTime
+
+  override def parse(s: String): Long = toMillis(s) * MICROS_PER_MILLIS
+
+  override def format(us: Long): String = {
+    format.format(DateTimeUtils.toJavaTimestamp(us))
+  }
+}
+
 object TimestampFormatter {
-  val defaultPattern: String = "uuuu-MM-dd HH:mm:ss"
   val defaultLocale: Locale = Locale.US
 
   def apply(format: String, zoneId: ZoneId, locale: Locale): TimestampFormatter = {
-    new Iso8601TimestampFormatter(format, zoneId, locale)
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyTimestampFormatter(format, zoneId, locale)
+    } else {
+      new Iso8601TimestampFormatter(format, zoneId, locale)
+    }
   }
 
   def apply(format: String, zoneId: ZoneId): TimestampFormatter = {
@@ -99,7 +123,11 @@ object TimestampFormatter {
   }
 
   def apply(zoneId: ZoneId): TimestampFormatter = {
-    apply(defaultPattern, zoneId, defaultLocale)
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyTimestampFormatter("yyyy-MM-dd HH:mm:ss", zoneId, defaultLocale)
+    } else {
+      new Iso8601TimestampFormatter("uuuu-MM-dd HH:mm:ss", zoneId, defaultLocale)
+    }
   }
 
   def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 3ad3416256c7..04fdb0d261ea 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2146,6 +2146,14 @@ object SQLConf {
     .checkValue(_ > 0, "The value of spark.sql.addPartitionInBatch.size must be positive")
     .createWithDefault(100)
 
+  val LEGACY_TIME_PARSER_ENABLED = buildConf("spark.sql.legacy.timeParser.enabled")
+    .internal()
+    .doc("When set to true, java.text.SimpleDateFormat is used for formatting and parsing " +
+      "dates/timestamps in a locale-sensitive manner. When set to false, classes from " +
+      "java.time.* packages are used for the same purpose.")
+    .booleanConf
+    .createWithDefault(false)
+
   /**
    * Holds information about keys that have been deprecated.
    *
@@ -2434,6 +2442,8 @@ class SQLConf extends Serializable with Logging {
   def legacyMsSqlServerNumericMappingEnabled: Boolean =
     getConf(LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED)
 
+  def legacyTimeParserEnabled: Boolean = getConf(SQLConf.LEGACY_TIME_PARSER_ENABLED)
+
   /**
    * Returns the [[Resolver]] for the current configuration, which can be used to determine if two
    * identifiers are equal.
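The factory dispatch added above essentially switches between two libraries. The following standalone sketch (no Spark session, only commons-lang3 on the classpath) contrasts them using the default patterns visible in the diff; Spark's real formatters add more plumbing (`DateTimeFormatterHelper`, special values, time zones), so this only illustrates the underlying parsers, not the patch itself.

```scala
import java.time.LocalDateTime
import java.time.format.{DateTimeFormatter, DateTimeParseException}
import java.util.Locale

import org.apache.commons.lang3.time.FastDateFormat

object StrictVsLegacyParsing extends App {
  val input = "2018-12-08 10:39:21.123"

  // Legacy path (FastDateFormat, SimpleDateFormat semantics): text after the
  // matched prefix is ignored, so the fractional ".123" is silently dropped.
  val legacy = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss", Locale.US)
  println(legacy.parse(input))

  // New path (java.time): the formatter refuses to leave unparsed text behind.
  val iso = DateTimeFormatter.ofPattern("uuuu-MM-dd HH:mm:ss", Locale.US)
  try {
    LocalDateTime.parse(input, iso)
  } catch {
    case e: DateTimeParseException => println(s"rejected by java.time: ${e.getMessage}")
  }
}
```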
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
index a48e61861c15..c2e03bd2c360 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.catalyst.json
 
-import com.fasterxml.jackson.core.JsonFactory
-
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
 class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
@@ -41,45 +40,61 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
   }
 
   test("inferring timestamp type") {
-    checkTimestampType("yyyy", """{"a": "2018"}""")
-    checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
-    checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
-    checkTimestampType(
-      "yyyy-MM-dd'T'HH:mm:ss.SSS",
-      """{"a": "2018-12-02T21:04:00.123"}""")
-    checkTimestampType(
-      "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
-      """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkTimestampType("yyyy", """{"a": "2018"}""")
+        checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
+        checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
+        checkTimestampType(
+          "yyyy-MM-dd'T'HH:mm:ss.SSS",
+          """{"a": "2018-12-02T21:04:00.123"}""")
+        checkTimestampType(
+          "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
+          """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+      }
+    }
   }
 
   test("prefer decimals over timestamps") {
-    checkType(
-      options = Map(
-        "prefersDecimal" -> "true",
-        "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
-      ),
-      json = """{"a": "20181202.210400123"}""",
-      dt = DecimalType(17, 9)
-    )
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkType(
+          options = Map(
+            "prefersDecimal" -> "true",
+            "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
+          ),
+          json = """{"a": "20181202.210400123"}""",
+          dt = DecimalType(17, 9)
+        )
+      }
+    }
   }
 
   test("skip decimal type inferring") {
-    checkType(
-      options = Map(
-        "prefersDecimal" -> "false",
-        "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
-      ),
-      json = """{"a": "20181202.210400123"}""",
-      dt = TimestampType
-    )
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkType(
+          options = Map(
+            "prefersDecimal" -> "false",
+            "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
+          ),
+          json = """{"a": "20181202.210400123"}""",
+          dt = TimestampType
+        )
+      }
+    }
   }
 
   test("fallback to string type") {
-    checkType(
-      options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
-      json = """{"a": "20181202.210400123"}""",
-      dt = StringType
-    )
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkType(
+          options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
+          json = """{"a": "20181202.210400123"}""",
+          dt = StringType
+        )
+      }
+    }
   }
 
   test("disable timestamp inferring") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index d7d8c2c52d12..3b3d3cc3d7a1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -789,4 +789,18 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
         Row(Timestamp.valueOf("2015-07-24 07:00:00")),
         Row(Timestamp.valueOf("2015-07-24 22:00:00"))))
   }
+
+  test("SPARK-30668: use legacy timestamp parser in to_timestamp") {
+    def checkTimeZoneParsing(expected: Any): Unit = {
+      val df = Seq("2020-01-27T20:06:11.847-0800").toDF("ts")
+      checkAnswer(df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")),
+        Row(expected))
+    }
+    withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "true") {
+      checkTimeZoneParsing(Timestamp.valueOf("2020-01-27 20:06:11.847"))
+    }
+    withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "false") {
+      checkTimeZoneParsing(null)
+    }
+  }
 }
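For completeness, the same switch can be exercised from SQL. This is a hypothetical spark-shell snippet mirroring the test above; the concrete timestamp in the comment assumes the America/Los_Angeles session time zone used in Spark's test environment, so other zones display a different value.

```scala
spark.sql("SET spark.sql.legacy.timeParser.enabled=true")
// Legacy parser: the SimpleDateFormat-style 'z' accepts the "-0800" offset,
// so this is expected to return 2020-01-27 20:06:11.847.
spark.sql(
  """SELECT to_timestamp('2020-01-27T20:06:11.847-0800', "yyyy-MM-dd'T'HH:mm:ss.SSSz")"""
).show(false)

spark.sql("SET spark.sql.legacy.timeParser.enabled=false")
// Default parser: the same expression is expected to return NULL, matching the test.
spark.sql(
  """SELECT to_timestamp('2020-01-27T20:06:11.847-0800', "yyyy-MM-dd'T'HH:mm:ss.SSSz")"""
).show(false)
```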