apache · MaxGekk · Dec 2, 2018 · Dec 2, 2018 · Dec 2, 2018 · Dec 3, 2018
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.json
 
+import java.text.ParsePosition
 import java.util.Comparator
 
 import scala.util.control.Exception.allCatch
@@ -28,7 +29,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.analysis.TypeCoercion
 import org.apache.spark.sql.catalyst.expressions.ExprUtils
 import org.apache.spark.sql.catalyst.json.JacksonUtils.nextUntil
-import org.apache.spark.sql.catalyst.util.{DropMalformedMode, FailFastMode, ParseMode, PermissiveMode}
+import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -37,6 +38,14 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
 
   private val decimalParser = ExprUtils.getDecimalParser(options.locale)
 
+  @transient // skipped by serialization; the lazy val below is rebuilt on first access
+  private lazy val timestampFormatter = TimestampFormatter(
+    options.timestampFormat,
+    options.timeZone,
+    options.locale)
+  @transient // skipped by serialization; the lazy val below is rebuilt on first access
+  private lazy val dateFormatter = DateFormatter(options.dateFormat, options.locale)
+
   /**
    * Infer the type of a collection of json records in three stages:
    *   1. Infer the type of each record
@@ -121,7 +130,17 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
             DecimalType(bigDecimal.precision, bigDecimal.scale)
         }
         decimalTry.getOrElse(StringType)
-      case VALUE_STRING => StringType
+      case VALUE_STRING =>
+        // Try the configured timestamp format first, then the date format; a string that
+        // matches neither stays StringType.
+        val stringValue = parser.getText
+        if ((allCatch opt timestampFormatter.parse(stringValue)).isDefined) {
+          TimestampType
+        } else if ((allCatch opt dateFormatter.parse(stringValue)).isDefined) {
+          DateType
+        } else {
+          StringType
+        }
 
       case START_OBJECT =>
         val builder = Array.newBuilder[StructField]

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.json
+
+import com.fasterxml.jackson.core.JsonFactory
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.types._
+
+/** Checks the types that [[JsonInferSchema]] infers for JSON string values. */
+class JsonInferSchemaSuite extends SparkFunSuite {
+
+  /**
+   * Asserts that the schema inferred from `json` is a struct with a single nullable
+   * field "a" of the given `type`.
+   *
+   * @param options JSON data source options used to build the [[JSONOptions]]
+   * @param json JSON text to infer; callers pass an object with the single key "a"
+   * @param type data type expected for field "a"
+   */
+  def checkType(options: Map[String, String], json: String, `type`: DataType): Unit = {
+    val jsonOptions = new JSONOptions(options, "GMT", "")
+    val inferSchema = new JsonInferSchema(jsonOptions)
+    val factory = new JsonFactory()
+    jsonOptions.setJacksonOptions(factory)
+    val parser = CreateJacksonParser.string(factory, json)
+    val expectedType = StructType(Seq(StructField("a", `type`, true)))
+    try {
+      // Advance to the first token before running inference.
+      parser.nextToken()
+      assert(inferSchema.inferField(parser) === expectedType)
+    } finally {
+      // Release the parser even when the assertion fails.
+      parser.close()
+    }
+  }
+
+  /** Checks that `json` is inferred as TimestampType under timestamp `pattern`. */
+  def checkTimestampType(pattern: String, json: String): Unit = {
+    checkType(Map("timestampFormat" -> pattern), json, TimestampType)
+  }
+
+  test("inferring timestamp type") {
+    checkTimestampType("yyyy", """{"a": "2018"}""")
+    checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
+    checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
+    checkTimestampType(
+      "yyyy-MM-dd'T'HH:mm:ss.SSS",
+      """{"a": "2018-12-02T21:04:00.123"}""")
+    checkTimestampType(
+      "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
+      """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+  }
+
+  /** Checks that `json` is inferred as DateType under date `pattern`. */
+  def checkDateType(pattern: String, json: String): Unit = {
+    checkType(Map("dateFormat" -> pattern), json, DateType)
+  }
+
+  test("inferring date type") {
+    checkDateType("yyyy", """{"a": "2018"}""")
+    checkDateType("yyyy-MM", """{"a": "2018-12"}""")
+    checkDateType("yyyy-MM-dd", """{"a": "2018-12-02"}""")
+  }
+
+  test("strict inferring of date and timestamps") {
+    // With both formats set, the full timestamp must infer as TimestampType and the
+    // bare date as DateType.
+    val options = Map(
+      "dateFormat" -> "yyyy-MM-dd",
+      "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss.SSS")
+    checkType(options, """{"a": "2018-12-02T21:04:00.123"}""", TimestampType)
+    checkType(options, """{"a": "2018-12-02"}""", DateType)
+  }
+}