docs/sql-migration-guide.md (4 changes: 1 addition & 3 deletions)
@@ -67,9 +67,7 @@ license: |

- Since Spark 3.0, the Proleptic Gregorian calendar is used in parsing, formatting, and converting dates and timestamps, as well as in extracting sub-components such as years and days. Spark 3.0 uses Java 8 API classes from the java.time packages that are based on the ISO chronology (https://docs.oracle.com/javase/8/docs/api/java/time/chrono/IsoChronology.html). In Spark version 2.4 and earlier, those operations are performed using the hybrid calendar (Julian + Gregorian, see https://docs.oracle.com/javase/7/docs/api/java/util/GregorianCalendar.html). The changes impact the results for dates before October 15, 1582 (Gregorian) and affect the following Spark 3.0 APIs:

- CSV/JSON datasources use the java.time API for parsing and generating CSV/JSON content. In Spark version 2.4 and earlier, java.text.SimpleDateFormat is used for the same purpose, with fallbacks to the parsing mechanisms of Spark 2.0 and 1.x. For example, `2018-12-08 10:39:21.123` with the pattern `yyyy-MM-dd'T'HH:mm:ss.SSS` cannot be parsed since Spark 3.0 because the timestamp does not match the pattern, but it can be parsed by earlier Spark versions due to a fallback to `Timestamp.valueOf`. To parse the same timestamp since Spark 3.0, the pattern should be `yyyy-MM-dd HH:mm:ss.SSS`.

- The `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions. The new implementation supports pattern formats as described at https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html and performs strict checking of its input. For example, the `2015-07-22 10:00:00` timestamp cannot be parsed if the pattern is `yyyy-MM-dd` because the parser does not consume the whole input. Another example: the `31/01/2015 00:00` input cannot be parsed by the `dd/MM/yyyy hh:mm` pattern because `hh` expects hours in the range `1-12`.
- Parsing/formatting of timestamp/date strings. This affects the CSV/JSON datasources and the `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions when user-specified patterns are used for parsing and formatting. Since Spark 3.0, the conversions are based on `java.time.format.DateTimeFormatter`, see https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html. The new implementation performs strict checking of its input. For example, the `2015-07-22 10:00:00` timestamp cannot be parsed if the pattern is `yyyy-MM-dd` because the parser does not consume the whole input. Another example: the `31/01/2015 00:00` input cannot be parsed by the `dd/MM/yyyy hh:mm` pattern because `hh` expects hours in the range `1-12`. In Spark version 2.4 and earlier, `java.text.SimpleDateFormat` is used for timestamp/date string conversions, and the supported patterns are described at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html. The old behavior can be restored by setting `spark.sql.legacy.timeParser.enabled` to `true`.
Contributor:

Is this really related to the Proleptic Gregorian calendar switch? It looks to me like we just switched to a better pattern string implementation.

Member Author:

Yes, it is related because SimpleDateFormat and DateTimeFormatter use different calendars underneath. Slightly different patterns are just a consequence of switching.


- The `weekofyear`, `weekday`, `dayofweek`, `date_trunc`, `from_utc_timestamp`, `to_utc_timestamp`, and `unix_timestamp` functions use the java.time API for calculating the week number of the year and the day number of the week, as well as for conversions from/to TimestampType values in the UTC time zone.

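To make the stricter pattern checking described above concrete, here is a small illustrative snippet (an editorial sketch, not part of this diff) that compares the two underlying Java APIs directly, reusing the `2015-07-22 10:00:00` example from the guide:

```scala
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.format.DateTimeFormatter

// Legacy path (Spark 2.4 and earlier): SimpleDateFormat stops once the pattern
// is satisfied, so the trailing " 10:00:00" is silently ignored.
val legacy = new SimpleDateFormat("yyyy-MM-dd")
legacy.parse("2015-07-22 10:00:00") // returns a java.util.Date for 2015-07-22

// New path (Spark 3.0): DateTimeFormatter requires the whole input to be
// consumed, so the same string is rejected for the same pattern.
val strict = DateTimeFormatter.ofPattern("yyyy-MM-dd")
LocalDate.parse("2015-07-22 10:00:00", strict) // throws DateTimeParseException
```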
@@ -20,7 +20,10 @@ package org.apache.spark.sql.catalyst.util
import java.time.{LocalDate, ZoneId}
import java.util.Locale

import DateTimeUtils.{convertSpecialDate, localDateToDays}
import org.apache.commons.lang3.time.FastDateFormat

import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, localDateToDays}
import org.apache.spark.sql.internal.SQLConf

sealed trait DateFormatter extends Serializable {
  def parse(s: String): Int // returns days since epoch
@@ -48,17 +48,41 @@ class Iso8601DateFormatter(
}
}

class LegacyDateFormatter(pattern: String, locale: Locale) extends DateFormatter {
  @transient
  private lazy val format = FastDateFormat.getInstance(pattern, locale)

  override def parse(s: String): Int = {
    val milliseconds = format.parse(s).getTime
    DateTimeUtils.millisToDays(milliseconds)
  }

  override def format(days: Int): String = {
    val date = DateTimeUtils.toJavaDate(days)
    format.format(date)
  }
}

object DateFormatter {
  val defaultPattern: String = "uuuu-MM-dd"
  val defaultLocale: Locale = Locale.US

  def apply(format: String, zoneId: ZoneId, locale: Locale): DateFormatter = {
    new Iso8601DateFormatter(format, zoneId, locale)
    if (SQLConf.get.legacyTimeParserEnabled) {
      new LegacyDateFormatter(format, locale)
    } else {
      new Iso8601DateFormatter(format, zoneId, locale)
    }
  }

  def apply(format: String, zoneId: ZoneId): DateFormatter = {
    apply(format, zoneId, defaultLocale)
  }

  def apply(zoneId: ZoneId): DateFormatter = apply(defaultPattern, zoneId)
  def apply(zoneId: ZoneId): DateFormatter = {
    if (SQLConf.get.legacyTimeParserEnabled) {
      new LegacyDateFormatter("yyyy-MM-dd", defaultLocale)
    } else {
      new Iso8601DateFormatter("uuuu-MM-dd", zoneId, defaultLocale)
    }
  }
}
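For orientation, a minimal usage sketch of the factory above (an editorial illustration, not part of the diff; it assumes `SQLConf.get` resolves to the session's configuration at call time):

```scala
import java.time.ZoneId

// Picks LegacyDateFormatter or Iso8601DateFormatter depending on
// spark.sql.legacy.timeParser.enabled.
val formatter = DateFormatter(ZoneId.of("UTC"))
val days = formatter.parse("2020-01-27") // days since the epoch 1970-01-01
val rendered = formatter.format(days)    // "2020-01-27"
```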
@@ -22,10 +22,14 @@ import java.time._
import java.time.format.DateTimeParseException
import java.time.temporal.ChronoField.MICRO_OF_SECOND
import java.time.temporal.TemporalQueries
import java.util.Locale
import java.util.{Locale, TimeZone}
import java.util.concurrent.TimeUnit.SECONDS

import DateTimeUtils.convertSpecialTimestamp
import org.apache.commons.lang3.time.FastDateFormat

import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS
import org.apache.spark.sql.catalyst.util.DateTimeUtils.convertSpecialTimestamp
import org.apache.spark.sql.internal.SQLConf

sealed trait TimestampFormatter extends Serializable {
  /**
@@ -86,20 +90,44 @@ class FractionTimestampFormatter(zoneId: ZoneId)
  override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
}

class LegacyTimestampFormatter(
    pattern: String,
    zoneId: ZoneId,
    locale: Locale) extends TimestampFormatter {

  @transient private lazy val format =
    FastDateFormat.getInstance(pattern, TimeZone.getTimeZone(zoneId), locale)

  protected def toMillis(s: String): Long = format.parse(s).getTime

  override def parse(s: String): Long = toMillis(s) * MICROS_PER_MILLIS

  override def format(us: Long): String = {
    format.format(DateTimeUtils.toJavaTimestamp(us))
  }
}

object TimestampFormatter {
  val defaultPattern: String = "uuuu-MM-dd HH:mm:ss"
  val defaultLocale: Locale = Locale.US

  def apply(format: String, zoneId: ZoneId, locale: Locale): TimestampFormatter = {
    new Iso8601TimestampFormatter(format, zoneId, locale)
    if (SQLConf.get.legacyTimeParserEnabled) {
      new LegacyTimestampFormatter(format, zoneId, locale)
    } else {
      new Iso8601TimestampFormatter(format, zoneId, locale)
    }
  }

  def apply(format: String, zoneId: ZoneId): TimestampFormatter = {
    apply(format, zoneId, defaultLocale)
  }

  def apply(zoneId: ZoneId): TimestampFormatter = {
    apply(defaultPattern, zoneId, defaultLocale)
    if (SQLConf.get.legacyTimeParserEnabled) {
      new LegacyTimestampFormatter("yyyy-MM-dd HH:mm:ss", zoneId, defaultLocale)
    } else {
      new Iso8601TimestampFormatter("uuuu-MM-dd HH:mm:ss", zoneId, defaultLocale)
    }
  }

  def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = {
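Analogously to `DateFormatter`, a minimal usage sketch of the timestamp factory (an editorial illustration, not part of the diff); note that parsed values are microseconds since the epoch, and on the legacy path precision is limited to milliseconds:

```scala
import java.time.ZoneId

// Picks LegacyTimestampFormatter or Iso8601TimestampFormatter depending on
// spark.sql.legacy.timeParser.enabled.
val tf = TimestampFormatter("yyyy-MM-dd HH:mm:ss", ZoneId.of("UTC"))
val micros = tf.parse("2020-01-27 20:06:11") // microseconds since the epoch
val text = tf.format(micros)                 // "2020-01-27 20:06:11"
```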
@@ -2146,6 +2146,14 @@ object SQLConf {
    .checkValue(_ > 0, "The value of spark.sql.addPartitionInBatch.size must be positive")
    .createWithDefault(100)

  val LEGACY_TIME_PARSER_ENABLED = buildConf("spark.sql.legacy.timeParser.enabled")
    .internal()
    .doc("When set to true, java.text.SimpleDateFormat is used for formatting and parsing " +
      "dates/timestamps in a locale-sensitive manner. When set to false, classes from " +
      "java.time.* packages are used for the same purpose.")
    .booleanConf
    .createWithDefault(false)

  /**
   * Holds information about keys that have been deprecated.
   *
@@ -2434,6 +2442,8 @@ class SQLConf extends Serializable with Logging {
  def legacyMsSqlServerNumericMappingEnabled: Boolean =
    getConf(LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED)

  def legacyTimeParserEnabled: Boolean = getConf(SQLConf.LEGACY_TIME_PARSER_ENABLED)

  /**
   * Returns the [[Resolver]] for the current configuration, which can be used to determine if two
   * identifiers are equal.
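Because the flag is read through `SQLConf.get`, it can be toggled per session. A hedged sketch of how a user might flip it (assumes a running `SparkSession` named `spark`, which is not part of this diff):

```scala
// Restore the pre-3.0 SimpleDateFormat-based parsing/formatting.
spark.conf.set("spark.sql.legacy.timeParser.enabled", "true")

// Switch back to the java.time based parsers (the default).
spark.sql("SET spark.sql.legacy.timeParser.enabled=false")
```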
@@ -17,10 +17,9 @@

package org.apache.spark.sql.catalyst.json

import com.fasterxml.jackson.core.JsonFactory

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._

class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
@@ -41,45 +40,61 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
  }

test("inferring timestamp type") {
checkTimestampType("yyyy", """{"a": "2018"}""")
checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
checkTimestampType(
"yyyy-MM-dd'T'HH:mm:ss.SSS",
"""{"a": "2018-12-02T21:04:00.123"}""")
checkTimestampType(
"yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
"""{"a": "2018-12-02T21:04:00.123567+01:00"}""")
Seq(true, false).foreach { legacyParser =>
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
checkTimestampType("yyyy", """{"a": "2018"}""")
checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
checkTimestampType(
"yyyy-MM-dd'T'HH:mm:ss.SSS",
"""{"a": "2018-12-02T21:04:00.123"}""")
checkTimestampType(
"yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
"""{"a": "2018-12-02T21:04:00.123567+01:00"}""")
}
}
}

test("prefer decimals over timestamps") {
checkType(
options = Map(
"prefersDecimal" -> "true",
"timestampFormat" -> "yyyyMMdd.HHmmssSSS"
),
json = """{"a": "20181202.210400123"}""",
dt = DecimalType(17, 9)
)
Seq(true, false).foreach { legacyParser =>
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
checkType(
options = Map(
"prefersDecimal" -> "true",
"timestampFormat" -> "yyyyMMdd.HHmmssSSS"
),
json = """{"a": "20181202.210400123"}""",
dt = DecimalType(17, 9)
)
}
}
}

test("skip decimal type inferring") {
checkType(
options = Map(
"prefersDecimal" -> "false",
"timestampFormat" -> "yyyyMMdd.HHmmssSSS"
),
json = """{"a": "20181202.210400123"}""",
dt = TimestampType
)
Seq(true, false).foreach { legacyParser =>
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
checkType(
options = Map(
"prefersDecimal" -> "false",
"timestampFormat" -> "yyyyMMdd.HHmmssSSS"
),
json = """{"a": "20181202.210400123"}""",
dt = TimestampType
)
}
}
}

test("fallback to string type") {
checkType(
options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
json = """{"a": "20181202.210400123"}""",
dt = StringType
)
Seq(true, false).foreach { legacyParser =>
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
checkType(
options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
json = """{"a": "20181202.210400123"}""",
dt = StringType
)
}
}
}

test("disable timestamp inferring") {
@@ -789,4 +789,18 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
Row(Timestamp.valueOf("2015-07-24 07:00:00")),
Row(Timestamp.valueOf("2015-07-24 22:00:00"))))
}

test("SPARK-30668: use legacy timestamp parser in to_timestamp") {
def checkTimeZoneParsing(expected: Any): Unit = {
val df = Seq("2020-01-27T20:06:11.847-0800").toDF("ts")
checkAnswer(df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")),
Row(expected))
}
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "true") {
checkTimeZoneParsing(Timestamp.valueOf("2020-01-27 20:06:11.847"))
}
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "false") {
checkTimeZoneParsing(null)
Member:

fallback to the old parser?

Member Author:

A silent fallback to the old parser can lead to mixed values in the same column: some in the hybrid Julian+Gregorian calendar, others in the Proleptic Gregorian calendar.

Member Author:

Strings parsed in different calendars can differ by a dozen days or so.

    }
  }
}
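To see the calendar gap the author is referring to, here is a small standalone sketch (an editorial illustration, not part of the PR) comparing the hybrid Julian+Gregorian calendar used by the legacy parser with the proleptic Gregorian calendar used by java.time, for a date just before the 1582 cutover:

```scala
import java.time.LocalDate
import java.util.{GregorianCalendar, TimeZone}

// Hybrid Julian + Gregorian calendar, as used by SimpleDateFormat/FastDateFormat.
val hybrid = new GregorianCalendar(TimeZone.getTimeZone("UTC"))
hybrid.clear()
hybrid.set(1582, 9, 4) // 1582-10-04; months are 0-based in java.util.Calendar
val hybridEpochDay = hybrid.getTimeInMillis / (24L * 60 * 60 * 1000)

// Proleptic Gregorian calendar, as used by java.time.
val prolepticEpochDay = LocalDate.of(1582, 10, 4).toEpochDay

// The same date string maps to epoch days 10 apart, which is why a silent
// per-value fallback could mix calendars within a single column.
println(hybridEpochDay - prolepticEpochDay) // 10
```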