From cb4147c3586a2df0bd803f6afae824c6132895c9 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Thu, 28 Jul 2022 17:26:32 +1200 Subject: [PATCH 1/7] rename the option --- .../src/main/resources/error/error-classes.json | 6 ++++++ docs/sql-data-sources-csv.md | 2 +- .../spark/sql/catalyst/csv/CSVInferSchema.scala | 6 +++--- .../spark/sql/catalyst/csv/CSVOptions.scala | 17 +++++++++++------ .../sql/catalyst/csv/UnivocityParser.scala | 4 ++-- .../spark/sql/errors/QueryExecutionErrors.scala | 6 ++++++ .../sql/catalyst/csv/CSVInferSchemaSuite.scala | 10 +++++----- .../sql/catalyst/csv/UnivocityParserSuite.scala | 4 ++-- .../execution/datasources/csv/CSVSuite.scala | 17 ++++++++++++++--- 9 files changed, 50 insertions(+), 22 deletions(-) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index c4b59799f88d..15084b2a0b50 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -29,6 +29,12 @@ ], "sqlState" : "22007" }, + "CANNOT_INFER_DATE_WITHOUT_INFER_SCHEMA" : { + "message" : [ + "Cannot infer date when schema inference is disabled." + ], + "sqlState" : "22007" + }, "CANNOT_PARSE_DECIMAL" : { "message" : [ "Cannot parse decimal" diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index 7b538528219a..ebdaa6864610 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -109,7 +109,7 @@ Data source options of CSV can be set via: read - inferDate + preferDate false Whether or not to infer columns that satisfy the dateFormat option as Date. Requires inferSchema to be true. When false, columns with dates will be inferred as String (or as Timestamp if it fits the timestampFormat). 
read diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 3132fea8700b..1c3f7fe2b92f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -124,9 +124,9 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { case _: DecimalType => tryParseDecimal(field) case DoubleType => tryParseDouble(field) case DateType => tryParseDateTime(field) - case TimestampNTZType if options.inferDate => tryParseDateTime(field) + case TimestampNTZType if options.preferDate => tryParseDateTime(field) case TimestampNTZType => tryParseTimestampNTZ(field) - case TimestampType if options.inferDate => tryParseDateTime(field) + case TimestampType if options.preferDate => tryParseDateTime(field) case TimestampType => tryParseTimestamp(field) case BooleanType => tryParseBoolean(field) case StringType => StringType @@ -178,7 +178,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { private def tryParseDouble(field: String): DataType = { if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) { DoubleType - } else if (options.inferDate) { + } else if (options.preferDate) { tryParseDateTime(field) } else { tryParseTimestampNTZ(field) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 27806ea1c403..4a7012a1767f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -153,19 +153,24 @@ class CSVOptions( * Disabled by default for backwards compatibility and performance. 
When enabled, date entries in * timestamp columns will be cast to timestamp upon parsing. Not compatible with * legacyTimeParserPolicy == LEGACY since legacy date parser will accept extra trailing characters + * + * The flag is only enabled if inferSchema is set to true. */ - val inferDate = { - val inferDateFlag = getBool("inferDate") - if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY && inferDateFlag) { + val preferDate = { + val preferDateFlag = getBool("preferDate") + if (preferDateFlag && SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { throw QueryExecutionErrors.inferDateWithLegacyTimeParserError() } - inferDateFlag + if (preferDateFlag && !inferSchemaFlag) { + throw QueryExecutionErrors.inferDateWithoutInferSchemaError() + } + preferDateFlag } - // Provide a default value for dateFormatInRead when inferDate. This ensures that the + // Provide a default value for dateFormatInRead when preferDate. This ensures that the // Iso8601DateFormatter (with strict date parsing) is used for date inference val dateFormatInRead: Option[String] = - if (inferDate) { + if (preferDate) { Option(parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)) } else { parameters.get("dateFormat") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index a6b4d7ea6679..cc44a1b71751 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -235,7 +235,7 @@ class UnivocityParser( } catch { case NonFatal(e) => // There may be date type entries in timestamp column due to schema inference - if (options.inferDate) { + if (options.preferDate) { daysToMicros(dateFormatter.parse(datum), options.zoneId) } else { // If fails to parse, then tries the way used in 2.0 and 1.x for backwards @@ 
-254,7 +254,7 @@ class UnivocityParser( try { timestampNTZFormatter.parseWithoutTimeZone(datum, false) } catch { - case NonFatal(e) if (options.inferDate) => + case NonFatal(e) if options.preferDate => daysToMicros(dateFormatter.parse(datum), TimeZoneUTC.toZoneId) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 35a40ce684f3..e05679e2ceef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -575,6 +575,12 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase { ) } + def inferDateWithoutInferSchemaError(): Throwable with SparkThrowable = { + new SparkIllegalArgumentException(errorClass = "CANNOT_INFER_DATE_WITHOUT_INFER_SCHEMA", + messageParameters = Array() + ) + } + def streamedOperatorUnsupportedByDataSourceError( className: String, operator: String): Throwable = { new UnsupportedOperationException( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala index 8790223a680f..2269c4331bcd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala @@ -201,19 +201,19 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { test("SPARK-39469: inferring date type") { // "yyyy/MM/dd" format - var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), + var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "preferDate" -> "true"), false, "UTC") var inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "2018/12/02") == DateType) // "MMM yyyy" 
format - options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "inferDate" -> "true"), + options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "preferDate" -> "true"), false, "GMT") inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "Dec 2018") == DateType) // Field should strictly match date format to infer as date options = new CSVOptions( Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", - "inferDate" -> "true"), + "preferDate" -> "true"), columnPruning = false, defaultTimeZoneId = "GMT") inferSchema = new CSVInferSchema(options) @@ -221,10 +221,10 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { assert(inferSchema.inferField(NullType, "2018-12-03") == DateType) } - test("SPARK-39469: inferring date and timestamp types in a mixed column with inferDate=true") { + test("SPARK-39469: inferring date and timestamp types in a mixed column with preferDate=true") { var options = new CSVOptions( Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy|MM|dd", - "timestampNTZFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), + "timestampNTZFormat" -> "yyyy/MM/dd", "preferDate" -> "true"), columnPruning = false, defaultTimeZoneId = "UTC") var inferSchema = new CSVInferSchema(options) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index 381ec57fcd13..7dbe89f9b513 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -373,10 +373,10 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { assert(err.getMessage.contains("Illegal pattern character: n")) } - test("SPARK-39469: dates should be parsed correctly in a timestamp column when inferDate=true") { + test("SPARK-39469: dates should be 
parsed correctly in a timestamp column when preferDate=true") { def checkDate(dataType: DataType): Unit = { val timestampsOptions = - new CSVOptions(Map("inferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", + new CSVOptions(Map("preferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", "timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"), false, DateTimeUtils.getZoneId("-08:00").toString) // Use CSVOption ZoneId="-08:00" (PST) to test that Dates in TimestampNTZ column are always diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 0e5718103902..d95a47ff6563 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2797,13 +2797,13 @@ abstract class CSVSuite "inferSchema" -> "true", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", "dateFormat" -> "yyyy-MM-dd", - "inferDate" -> "true") + "preferDate" -> "true") val options2 = Map( "header" -> "true", "inferSchema" -> "true", - "inferDate" -> "true") + "preferDate" -> "true") - // Error should be thrown when attempting to inferDate with Legacy parser + // Error should be thrown when attempting to preferDate with Legacy parser if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { val msg = intercept[IllegalArgumentException] { spark.read @@ -2840,6 +2840,17 @@ abstract class CSVSuite } } + test("SPARK-39904: Fail to prefer dates if inferSchema=false") { + val msg = intercept[IllegalArgumentException] { + spark.read + .format("csv") + .option("inferSchema", "false") + .option("preferDate", "true") + .load(testFile(dateInferSchemaFile)) + }.getMessage + assert(msg.contains("CANNOT_INFER_DATE_WITHOUT_INFER_SCHEMA")) + } + test("SPARK-39731: Correctly parse dates and timestamps with yyyyMMdd 
pattern") { withTempPath { path => Seq( From a865018227644ce994c3b20721aa5bcf6444de8e Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Mon, 1 Aug 2022 11:42:45 +1200 Subject: [PATCH 2/7] revert rename --- core/src/main/resources/error/error-classes.json | 6 ------ docs/sql-data-sources-csv.md | 2 +- .../spark/sql/catalyst/csv/CSVInferSchema.scala | 6 +++--- .../spark/sql/catalyst/csv/CSVOptions.scala | 15 ++++++--------- .../spark/sql/catalyst/csv/UnivocityParser.scala | 4 ++-- .../spark/sql/errors/QueryExecutionErrors.scala | 6 ------ .../sql/catalyst/csv/CSVInferSchemaSuite.scala | 10 +++++----- .../sql/catalyst/csv/UnivocityParserSuite.scala | 4 ++-- .../sql/execution/datasources/csv/CSVSuite.scala | 8 ++++---- 9 files changed, 23 insertions(+), 38 deletions(-) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 15084b2a0b50..c4b59799f88d 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -29,12 +29,6 @@ ], "sqlState" : "22007" }, - "CANNOT_INFER_DATE_WITHOUT_INFER_SCHEMA" : { - "message" : [ - "Cannot infer date when schema inference is disabled." - ], - "sqlState" : "22007" - }, "CANNOT_PARSE_DECIMAL" : { "message" : [ "Cannot parse decimal" diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index ebdaa6864610..57c8f67839bd 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -109,7 +109,7 @@ Data source options of CSV can be set via: read - preferDate + inferDate false Whether or not to infer columns that satisfy the dateFormat option as Date. Requires inferSchema to be true. When false, columns with dates will be inferred as String (or as Timestamp if it fits the timestampFormat). 
read diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 1c3f7fe2b92f..3132fea8700b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -124,9 +124,9 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { case _: DecimalType => tryParseDecimal(field) case DoubleType => tryParseDouble(field) case DateType => tryParseDateTime(field) - case TimestampNTZType if options.preferDate => tryParseDateTime(field) + case TimestampNTZType if options.inferDate => tryParseDateTime(field) case TimestampNTZType => tryParseTimestampNTZ(field) - case TimestampType if options.preferDate => tryParseDateTime(field) + case TimestampType if options.inferDate => tryParseDateTime(field) case TimestampType => tryParseTimestamp(field) case BooleanType => tryParseBoolean(field) case StringType => StringType @@ -178,7 +178,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { private def tryParseDouble(field: String): DataType = { if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) { DoubleType - } else if (options.preferDate) { + } else if (options.inferDate) { tryParseDateTime(field) } else { tryParseTimestampNTZ(field) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 4a7012a1767f..dfc3abe823bb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -156,21 +156,18 @@ class CSVOptions( * * The flag is only enabled if inferSchema is set to true. 
*/ - val preferDate = { - val preferDateFlag = getBool("preferDate") - if (preferDateFlag && SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { + val inferDate = { + val inferDateFlag = getBool("inferDate") + if (inferDateFlag && SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { throw QueryExecutionErrors.inferDateWithLegacyTimeParserError() } - if (preferDateFlag && !inferSchemaFlag) { - throw QueryExecutionErrors.inferDateWithoutInferSchemaError() - } - preferDateFlag + inferDateFlag } - // Provide a default value for dateFormatInRead when preferDate. This ensures that the + // Provide a default value for dateFormatInRead when inferDate. This ensures that the // Iso8601DateFormatter (with strict date parsing) is used for date inference val dateFormatInRead: Option[String] = - if (preferDate) { + if (inferDate) { Option(parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)) } else { parameters.get("dateFormat") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index cc44a1b71751..aea8cb49e7b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -235,7 +235,7 @@ class UnivocityParser( } catch { case NonFatal(e) => // There may be date type entries in timestamp column due to schema inference - if (options.preferDate) { + if (options.inferDate) { daysToMicros(dateFormatter.parse(datum), options.zoneId) } else { // If fails to parse, then tries the way used in 2.0 and 1.x for backwards @@ -254,7 +254,7 @@ class UnivocityParser( try { timestampNTZFormatter.parseWithoutTimeZone(datum, false) } catch { - case NonFatal(e) if options.preferDate => + case NonFatal(e) if options.inferDate => daysToMicros(dateFormatter.parse(datum), TimeZoneUTC.toZoneId) } } diff 
--git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index e05679e2ceef..35a40ce684f3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -575,12 +575,6 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase { ) } - def inferDateWithoutInferSchemaError(): Throwable with SparkThrowable = { - new SparkIllegalArgumentException(errorClass = "CANNOT_INFER_DATE_WITHOUT_INFER_SCHEMA", - messageParameters = Array() - ) - } - def streamedOperatorUnsupportedByDataSourceError( className: String, operator: String): Throwable = { new UnsupportedOperationException( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala index 2269c4331bcd..8790223a680f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala @@ -201,19 +201,19 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { test("SPARK-39469: inferring date type") { // "yyyy/MM/dd" format - var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "preferDate" -> "true"), + var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), false, "UTC") var inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "2018/12/02") == DateType) // "MMM yyyy" format - options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "preferDate" -> "true"), + options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "inferDate" -> "true"), false, "GMT") inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "Dec 
2018") == DateType) // Field should strictly match date format to infer as date options = new CSVOptions( Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", - "preferDate" -> "true"), + "inferDate" -> "true"), columnPruning = false, defaultTimeZoneId = "GMT") inferSchema = new CSVInferSchema(options) @@ -221,10 +221,10 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { assert(inferSchema.inferField(NullType, "2018-12-03") == DateType) } - test("SPARK-39469: inferring date and timestamp types in a mixed column with preferDate=true") { + test("SPARK-39469: inferring date and timestamp types in a mixed column with inferDate=true") { var options = new CSVOptions( Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy|MM|dd", - "timestampNTZFormat" -> "yyyy/MM/dd", "preferDate" -> "true"), + "timestampNTZFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), columnPruning = false, defaultTimeZoneId = "UTC") var inferSchema = new CSVInferSchema(options) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index 7dbe89f9b513..381ec57fcd13 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -373,10 +373,10 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { assert(err.getMessage.contains("Illegal pattern character: n")) } - test("SPARK-39469: dates should be parsed correctly in a timestamp column when preferDate=true") { + test("SPARK-39469: dates should be parsed correctly in a timestamp column when inferDate=true") { def checkDate(dataType: DataType): Unit = { val timestampsOptions = - new CSVOptions(Map("preferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", + new CSVOptions(Map("inferDate" -> "true", 
"timestampFormat" -> "dd/MM/yyyy HH:mm", "timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"), false, DateTimeUtils.getZoneId("-08:00").toString) // Use CSVOption ZoneId="-08:00" (PST) to test that Dates in TimestampNTZ column are always diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index d95a47ff6563..8341d4f1b339 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2797,13 +2797,13 @@ abstract class CSVSuite "inferSchema" -> "true", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", "dateFormat" -> "yyyy-MM-dd", - "preferDate" -> "true") + "inferDate" -> "true") val options2 = Map( "header" -> "true", "inferSchema" -> "true", - "preferDate" -> "true") + "inferDate" -> "true") - // Error should be thrown when attempting to preferDate with Legacy parser + // Error should be thrown when attempting to inferDate with Legacy parser if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { val msg = intercept[IllegalArgumentException] { spark.read @@ -2845,7 +2845,7 @@ abstract class CSVSuite spark.read .format("csv") .option("inferSchema", "false") - .option("preferDate", "true") + .option("inferDate", "true") .load(testFile(dateInferSchemaFile)) }.getMessage assert(msg.contains("CANNOT_INFER_DATE_WITHOUT_INFER_SCHEMA")) From b475d514c75ad67af6aa3945418b9aa29c4d7b8b Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Mon, 1 Aug 2022 12:31:52 +1200 Subject: [PATCH 3/7] update documentation and test --- docs/sql-data-sources-csv.md | 6 +-- docs/sql-data-sources-json.md | 4 +- .../spark/sql/catalyst/csv/CSVOptions.scala | 14 ++++--- .../execution/datasources/csv/CSVSuite.scala | 41 +++++++++++++++---- 4 files changed, 47 insertions(+), 18 deletions(-) diff --git 
a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index 57c8f67839bd..162acb04845e 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -111,7 +111,7 @@ Data source options of CSV can be set via: inferDate false - Whether or not to infer columns that satisfy the dateFormat option as Date. Requires inferSchema to be true. When false, columns with dates will be inferred as String (or as Timestamp if it fits the timestampFormat). + Attempts to infer string columns that contain dates or timestamps as Date if the values satisfy dateFormat option and failed to be parsed by the respective formatter during schema inference (inferSchema). When used in conjunction with a user-provided schema, attempts parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat, the parsed values will be cast to timestamp type afterwards. read @@ -176,8 +176,8 @@ Data source options of CSV can be set via: enableDateTimeParsingFallback - Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided - Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. + Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided. + Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. 
read diff --git a/docs/sql-data-sources-json.md b/docs/sql-data-sources-json.md index 500cd65b58b8..a0772dd3656f 100644 --- a/docs/sql-data-sources-json.md +++ b/docs/sql-data-sources-json.md @@ -204,8 +204,8 @@ Data source options of JSON can be set via: enableDateTimeParsingFallback - Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided - Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. + Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided. + Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns. read diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index dfc3abe823bb..e34d730f1747 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -149,12 +149,16 @@ class CSVOptions( val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US) /** - * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type). - * Disabled by default for backwards compatibility and performance. When enabled, date entries in - * timestamp columns will be cast to timestamp upon parsing. Not compatible with - * legacyTimeParserPolicy == LEGACY since legacy date parser will accept extra trailing characters + * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type) + * if schema inference is enabled. 
When being used with user-provided schema, tries to parse + * timestamp values as dates if the values do not conform to the timestamp formatter before + * falling back to the backward compatible parsing - the parsed values will be cast to timestamp + * afterwards. * - * The flag is only enabled if inferSchema is set to true. + * Disabled by default for backwards compatibility and performance. + * + * Not compatible with legacyTimeParserPolicy == LEGACY since legacy date parser will accept + * extra trailing characters. */ val inferDate = { val inferDateFlag = getBool("inferDate") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 8341d4f1b339..934fa02e748d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2840,15 +2840,40 @@ abstract class CSVSuite } } - test("SPARK-39904: Fail to prefer dates if inferSchema=false") { - val msg = intercept[IllegalArgumentException] { - spark.read - .format("csv") - .option("inferSchema", "false") + test("SPARK-39904: Parse incorrect timestamp values with inferDate=true") { + withTempPath { path => + Seq( + "2020-02-01 12:34:56", + "2020-02-02", + "invalid" + ).toDF() + .repartition(1) + .write.text(path.getAbsolutePath) + + val schema = new StructType() + .add("ts", TimestampType) + + val output = spark.read + .schema(schema) .option("inferDate", "true") - .load(testFile(dateInferSchemaFile)) - }.getMessage - assert(msg.contains("CANNOT_INFER_DATE_WITHOUT_INFER_SCHEMA")) + .csv(path.getAbsolutePath) + + if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { + val msg = intercept[IllegalArgumentException] { + output.collect() + }.getMessage + assert(msg.contains("CANNOT_INFER_DATE")) + } else { + checkAnswer( + output, + Seq( + 
Row(Timestamp.valueOf("2020-02-01 12:34:56")), + Row(Timestamp.valueOf("2020-02-02 00:00:00")), + Row(null) + ) + ) + } + } } test("SPARK-39731: Correctly parse dates and timestamps with yyyyMMdd pattern") { From 6e75a735c1d2708fe25db0d347ce766193d0d8f8 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Mon, 1 Aug 2022 14:12:44 +1200 Subject: [PATCH 4/7] rename inferDate to prefersDate --- docs/sql-data-sources-csv.md | 2 +- .../apache/spark/sql/catalyst/csv/CSVInferSchema.scala | 6 +++--- .../org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 8 ++++---- .../spark/sql/catalyst/csv/UnivocityParser.scala | 4 ++-- .../spark/sql/catalyst/csv/CSVInferSchemaSuite.scala | 10 +++++----- .../spark/sql/catalyst/csv/UnivocityParserSuite.scala | 4 ++-- .../spark/sql/execution/datasources/csv/CSVSuite.scala | 10 +++++----- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index 162acb04845e..ebb6d0d1c01b 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -109,7 +109,7 @@ Data source options of CSV can be set via: read - inferDate + prefersDate false Attempts to infer string columns that contain dates or timestamps as Date if the values satisfy dateFormat option and failed to be parsed by the respective formatter during schema inference (inferSchema). When used in conjunction with a user-provided schema, attempts parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat, the parsed values will be cast to timestamp type afterwards. 
read diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 3132fea8700b..53d748989204 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -124,9 +124,9 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { case _: DecimalType => tryParseDecimal(field) case DoubleType => tryParseDouble(field) case DateType => tryParseDateTime(field) - case TimestampNTZType if options.inferDate => tryParseDateTime(field) + case TimestampNTZType if options.prefersDate => tryParseDateTime(field) case TimestampNTZType => tryParseTimestampNTZ(field) - case TimestampType if options.inferDate => tryParseDateTime(field) + case TimestampType if options.prefersDate => tryParseDateTime(field) case TimestampType => tryParseTimestamp(field) case BooleanType => tryParseBoolean(field) case StringType => StringType @@ -178,7 +178,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { private def tryParseDouble(field: String): DataType = { if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) { DoubleType - } else if (options.inferDate) { + } else if (options.prefersDate) { tryParseDateTime(field) } else { tryParseTimestampNTZ(field) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index e34d730f1747..1162c2882dd7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -160,18 +160,18 @@ class CSVOptions( * Not compatible with legacyTimeParserPolicy == LEGACY since legacy date parser will accept * extra trailing characters. 
*/ - val inferDate = { - val inferDateFlag = getBool("inferDate") + val prefersDate = { + val inferDateFlag = getBool("prefersDate") if (inferDateFlag && SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { throw QueryExecutionErrors.inferDateWithLegacyTimeParserError() } inferDateFlag } - // Provide a default value for dateFormatInRead when inferDate. This ensures that the + // Provide a default value for dateFormatInRead when prefersDate. This ensures that the // Iso8601DateFormatter (with strict date parsing) is used for date inference val dateFormatInRead: Option[String] = - if (inferDate) { + if (prefersDate) { Option(parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)) } else { parameters.get("dateFormat") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index aea8cb49e7b7..c9955d72524c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -235,7 +235,7 @@ class UnivocityParser( } catch { case NonFatal(e) => // There may be date type entries in timestamp column due to schema inference - if (options.inferDate) { + if (options.prefersDate) { daysToMicros(dateFormatter.parse(datum), options.zoneId) } else { // If fails to parse, then tries the way used in 2.0 and 1.x for backwards @@ -254,7 +254,7 @@ class UnivocityParser( try { timestampNTZFormatter.parseWithoutTimeZone(datum, false) } catch { - case NonFatal(e) if options.inferDate => + case NonFatal(e) if options.prefersDate => daysToMicros(dateFormatter.parse(datum), TimeZoneUTC.toZoneId) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala index 8790223a680f..7066a5614ee9 
100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala @@ -201,19 +201,19 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { test("SPARK-39469: inferring date type") { // "yyyy/MM/dd" format - var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), + var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"), false, "UTC") var inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "2018/12/02") == DateType) // "MMM yyyy" format - options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "inferDate" -> "true"), + options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "prefersDate" -> "true"), false, "GMT") inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(NullType, "Dec 2018") == DateType) // Field should strictly match date format to infer as date options = new CSVOptions( Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", - "inferDate" -> "true"), + "prefersDate" -> "true"), columnPruning = false, defaultTimeZoneId = "GMT") inferSchema = new CSVInferSchema(options) @@ -221,10 +221,10 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { assert(inferSchema.inferField(NullType, "2018-12-03") == DateType) } - test("SPARK-39469: inferring date and timestamp types in a mixed column with inferDate=true") { + test("SPARK-39469: inferring date and timestamp types in a mixed column with prefersDate=true") { var options = new CSVOptions( Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy|MM|dd", - "timestampNTZFormat" -> "yyyy/MM/dd", "inferDate" -> "true"), + "timestampNTZFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"), columnPruning = false, defaultTimeZoneId = "UTC") var inferSchema = new CSVInferSchema(options) diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index 381ec57fcd13..fdb884c6cf22 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -373,10 +373,10 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { assert(err.getMessage.contains("Illegal pattern character: n")) } - test("SPARK-39469: dates should be parsed correctly in a timestamp column when inferDate=true") { + test("SPARK-39469: dates should be parsed correctly in a timestamp column when prefersDate=true") { def checkDate(dataType: DataType): Unit = { val timestampsOptions = - new CSVOptions(Map("inferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", + new CSVOptions(Map("prefersDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", "timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"), false, DateTimeUtils.getZoneId("-08:00").toString) // Use CSVOption ZoneId="-08:00" (PST) to test that Dates in TimestampNTZ column are always diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 934fa02e748d..0068f57a7697 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2797,13 +2797,13 @@ abstract class CSVSuite "inferSchema" -> "true", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", "dateFormat" -> "yyyy-MM-dd", - "inferDate" -> "true") + "prefersDate" -> "true") val options2 = Map( "header" -> "true", "inferSchema" -> "true", - "inferDate" -> "true") + "prefersDate" -> "true") - // Error should be thrown when attempting 
to inferDate with Legacy parser + // Error should be thrown when attempting to use prefersDate with Legacy parser if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { val msg = intercept[IllegalArgumentException] { spark.read @@ -2840,7 +2840,7 @@ abstract class CSVSuite } } - test("SPARK-39904: Parse incorrect timestamp values with inferDate=true") { + test("SPARK-39904: Parse incorrect timestamp values with prefersDate=true") { withTempPath { path => Seq( "2020-02-01 12:34:56", @@ -2855,7 +2855,7 @@ abstract class CSVSuite val output = spark.read .schema(schema) - .option("inferDate", "true") + .option("prefersDate", "true") .csv(path.getAbsolutePath) if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { From 71a095f4678a4a7ec48ff82ab206833f06f6b972 Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Mon, 1 Aug 2022 17:29:04 +1200 Subject: [PATCH 5/7] fix scalastyle --- .../apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index fdb884c6cf22..42bc122dfdcb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -373,7 +373,7 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { assert(err.getMessage.contains("Illegal pattern character: n")) } - test("SPARK-39469: dates should be parsed correctly in a timestamp column when prefersDate=true") { + test("SPARK-39469: dates should be parsed correctly in timestamp column when prefersDate=true") { def checkDate(dataType: DataType): Unit = { val timestampsOptions = new CSVOptions(Map("prefersDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm", From
5b01e20c8531d92231419af8fe1f537c82250ffe Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Mon, 1 Aug 2022 17:38:42 +1200 Subject: [PATCH 6/7] update documentation --- docs/sql-data-sources-csv.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index ebb6d0d1c01b..0552f4393998 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -111,7 +111,7 @@ Data source options of CSV can be set via: prefersDate false - Attempts to infer string columns that contain dates or timestamps as Date if the values satisfy dateFormat option and failed to be parsed by the respective formatter during schema inference (inferSchema). When used in conjunction with a user-provided schema, attempts parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat, the parsed values will be cast to timestamp type afterwards. + Attempts to infer string columns as Date if the values satisfy dateFormat option and failed to be parsed by the respective formatter during schema inference (inferSchema). When used in conjunction with a user-provided schema, attempts to parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat, the parsed values will be cast to timestamp type afterwards. 
read From ef445be15b9f71c5e0c51e2e2d8f70e4b6b537af Mon Sep 17 00:00:00 2001 From: Ivan Sadikov Date: Wed, 3 Aug 2022 11:13:00 +1200 Subject: [PATCH 7/7] update documentation --- docs/sql-data-sources-csv.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-data-sources-csv.md b/docs/sql-data-sources-csv.md index 0552f4393998..98d31a59ac7a 100644 --- a/docs/sql-data-sources-csv.md +++ b/docs/sql-data-sources-csv.md @@ -111,7 +111,7 @@ Data source options of CSV can be set via: prefersDate false - Attempts to infer string columns as Date if the values satisfy dateFormat option and failed to be parsed by the respective formatter during schema inference (inferSchema). When used in conjunction with a user-provided schema, attempts to parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat, the parsed values will be cast to timestamp type afterwards. + During schema inference (inferSchema), attempts to infer string columns that contain dates or timestamps as Date if the values satisfy the dateFormat option and fail to be parsed by the respective formatter. With a user-provided schema, attempts to parse timestamp columns as dates using dateFormat if they fail to conform to timestampFormat; in this case, the parsed values will be cast to timestamp type afterwards. read