diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index 7b538528219a..98d31a59ac7a 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -109,9 +109,9 @@ Data source options of CSV can be set via: read - inferDate + prefersDate false - Whether or not to infer columns that satisfy the dateFormat option as Date. Requires inferSchema to be true. When false, columns with dates will be inferred as String (or as Timestamp if it fits the timestampFormat). + During schema inference (inferSchema), attempts to infer string columns that contain dates or timestamps as Date if the values satisfy the dateFormat option and failed to be parsed by the respective formatter. With a user-provided schema, attempts to parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat, in this case the parsed values will be cast to timestamp type afterwards. read @@ -176,8 +176,8 @@ Data source options of CSV can be set via: enableDateTimeParsingFallback - Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided - Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. + Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided. + Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. read diff --git a/docs/sql-data-sources-json.md b/docs/sql-data-sources-json.md index 500cd65b58b8..a0772dd3656f 100644 --- a/docs/sql-data-sources-json.md +++ b/docs/sql-data-sources-json.md @@ -204,8 +204,8 @@ Data source options of JSON can be set via: enableDateTimeParsingFallback - Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided - Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. + Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided. + Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. read diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 3132fea8700b..53d748989204 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -124,9 +124,9 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { case _: DecimalType => tryParseDecimal(field) case DoubleType => tryParseDouble(field) case DateType => tryParseDateTime(field) - case TimestampNTZType if options.inferDate => tryParseDateTime(field) + case TimestampNTZType if options.prefersDate => tryParseDateTime(field) case TimestampNTZType => tryParseTimestampNTZ(field) - case TimestampType if options.inferDate => tryParseDateTime(field) + case TimestampType if options.prefersDate => tryParseDateTime(field) case TimestampType => tryParseTimestamp(field) case BooleanType => tryParseBoolean(field) case StringType => StringType @@ -178,7 +178,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { private def tryParseDouble(field: String): DataType = { if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) { DoubleType - } else if (options.inferDate) { + } else if (options.prefersDate) { tryParseDateTime(field) } else { tryParseTimestampNTZ(field) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 27806ea1c403..1162c2882dd7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -149,23 +149,29 @@ class CSVOptions( val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US) /** - * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type). - * Disabled by default for backwards compatibility and performance. When enabled, date entries in - * timestamp columns will be cast to timestamp upon parsing. Not compatible with - * legacyTimeParserPolicy == LEGACY since legacy date parser will accept extra trailing characters + * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type) + * if schema inference is enabled. When being used with user-provided schema, tries to parse + * timestamp values as dates if the values do not conform to the timestamp formatter before + * falling back to the backward compatible parsing - the parsed values will be cast to timestamp + * afterwards. + * + * Disabled by default for backwards compatibility and performance. + * + * Not compatible with legacyTimeParserPolicy == LEGACY since legacy date parser will accept + * extra trailing characters. */ - val inferDate = { - val inferDateFlag = getBool("inferDate") - if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY && inferDateFlag) { + val prefersDate = { + val inferDateFlag = getBool("prefersDate") + if (inferDateFlag && SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { throw QueryExecutionErrors.inferDateWithLegacyTimeParserError() } inferDateFlag } - // Provide a default value for dateFormatInRead when inferDate. This ensures that the + // Provide a default value for dateFormatInRead when prefersDate. This ensures that the // Iso8601DateFormatter (with strict date parsing) is used for date inference val dateFormatInRead: Option[String] = - if (inferDate) { + if (prefersDate) { Option(parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)) } else { parameters.get("dateFormat") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index a6b4d7ea6679..c9955d72524c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -235,7 +235,7 @@ class UnivocityParser( } catch { case NonFatal(e) => // There may be date type entries in timestamp column due to schema inference - if (options.inferDate) { + if (options.prefersDate) { daysToMicros(dateFormatter.parse(datum), options.zoneId) } else { // If fails to parse, then tries the way used in 2.0 and 1.x for backwards @@ -254,7 +254,7 @@ class UnivocityParser( try { timestampNTZFormatter.parseWithoutTimeZone(datum, false) } catch { - case NonFatal(e) if (options.inferDate) => + case NonFatal(e) if options.prefersDate => daysToMicros(dateFormatter.parse(datum), TimeZoneUTC.toZoneId) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala index 8790223a680f..7066a5614ee9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala @@ -201,19 +201,19 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { test("SPARK-39469: inferring date type") { // "yyyy/MM/dd" format - var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), + var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"), false, "UTC") var inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "2018/12/02") == DateType) // "MMM yyyy" format - options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "inferDate" -> "true"), + options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "prefersDate" -> "true"), false, "GMT") inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "Dec 2018") == DateType) // Field should strictly match date format to infer as date options = new CSVOptions( Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", - "inferDate" -> "true"), + "prefersDate" -> "true"), columnPruning = false, defaultTimeZoneId = "GMT") inferSchema = new CSVInferSchema(options) @@ -221,10 +221,10 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { assert(inferSchema.inferField(NullType, "2018-12-03") == DateType) } - test("SPARK-39469: inferring date and timestamp types in a mixed column with inferDate=true") { + test("SPARK-39469: inferring date and timestamp types in a mixed column with prefersDate=true") { var options = new CSVOptions( Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy|MM|dd", - "timestampNTZFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), + "timestampNTZFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"), columnPruning = false, defaultTimeZoneId = "UTC") var inferSchema = new CSVInferSchema(options) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index 381ec57fcd13..42bc122dfdcb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -373,10 +373,10 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { assert(err.getMessage.contains("Illegal pattern character: n")) } - test("SPARK-39469: dates should be parsed correctly in a timestamp column when inferDate=true") { + test("SPARK-39469: dates should be parsed correctly in timestamp column when prefersDate=true") { def checkDate(dataType: DataType): Unit = { val timestampsOptions = - new CSVOptions(Map("inferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", + new CSVOptions(Map("prefersDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", "timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"), false, DateTimeUtils.getZoneId("-08:00").toString) // Use CSVOption ZoneId="-08:00" (PST) to test that Dates in TimestampNTZ column are always diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 0e5718103902..0068f57a7697 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2797,13 +2797,13 @@ abstract class CSVSuite "inferSchema" -> "true", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", "dateFormat" -> "yyyy-MM-dd", - "inferDate" -> "true") + "prefersDate" -> "true") val options2 = Map( "header" -> "true", "inferSchema" -> "true", - "inferDate" -> "true") + "prefersDate" -> "true") - // Error should be thrown when attempting to inferDate with Legacy parser + // Error should be thrown when attempting to prefersDate with Legacy parser if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { val msg = intercept[IllegalArgumentException] { spark.read @@ -2840,6 +2840,42 @@ abstract class CSVSuite } } + test("SPARK-39904: Parse incorrect timestamp values with prefersDate=true") { + withTempPath { path => + Seq( + "2020-02-01 12:34:56", + "2020-02-02", + "invalid" + ).toDF() + .repartition(1) + .write.text(path.getAbsolutePath) + + val schema = new StructType() + .add("ts", TimestampType) + + val output = spark.read + .schema(schema) + .option("prefersDate", "true") + .csv(path.getAbsolutePath) + + if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { + val msg = intercept[IllegalArgumentException] { + output.collect() + }.getMessage + assert(msg.contains("CANNOT_INFER_DATE")) + } else { + checkAnswer( + output, + Seq( + Row(Timestamp.valueOf("2020-02-01 12:34:56")), + Row(Timestamp.valueOf("2020-02-02 00:00:00")), + Row(null) + ) + ) + } + } + } + test("SPARK-39731: Correctly parse dates and timestamps with yyyyMMdd pattern") { withTempPath { path => Seq(