-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-37326][SQL] Support TimestampNTZ in CSV data source #34596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7e73e59
6bfc1b1
ea47b94
e439ae2
7eca451
1d25ac7
b718db1
014cd20
309643b
f59ba6c
662460f
043edb6
1edef2d
feb3715
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.ExprUtils | |
| import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT | ||
| import org.apache.spark.sql.catalyst.util.TimestampFormatter | ||
| import org.apache.spark.sql.errors.QueryExecutionErrors | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
| class CSVInferSchema(val options: CSVOptions) extends Serializable { | ||
|
|
@@ -38,6 +39,13 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
| legacyFormat = FAST_DATE_FORMAT, | ||
| isParsing = true) | ||
|
|
||
| private val timestampNTZFormatter = TimestampFormatter( | ||
| options.timestampNTZFormatInRead, | ||
| options.zoneId, | ||
| legacyFormat = FAST_DATE_FORMAT, | ||
| isParsing = true, | ||
| forTimestampNTZ = true) | ||
|
|
||
| private val decimalParser = if (options.locale == Locale.US) { | ||
| // Special handling the default locale for backward compatibility | ||
| s: String => new java.math.BigDecimal(s) | ||
|
|
@@ -109,6 +117,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
| case LongType => tryParseLong(field) | ||
| case _: DecimalType => tryParseDecimal(field) | ||
| case DoubleType => tryParseDouble(field) | ||
| case TimestampNTZType => tryParseTimestampNTZ(field) | ||
| case TimestampType => tryParseTimestamp(field) | ||
| case BooleanType => tryParseBoolean(field) | ||
| case StringType => StringType | ||
|
|
@@ -160,6 +169,17 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
| private def tryParseDouble(field: String): DataType = { | ||
| if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) { | ||
| DoubleType | ||
| } else { | ||
| tryParseTimestampNTZ(field) | ||
sadikovi marked this conversation as resolved.
Show resolved
Hide resolved
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not working on JSON just yet, this is the CSV data source.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I showed JSON as an example because I guess CSV has a similar issue. |
||
| } | ||
| } | ||
|
|
||
| private def tryParseTimestampNTZ(field: String): DataType = { | ||
| // We can only parse the value as TimestampNTZType if it does not have zone-offset or | ||
| // time-zone component and can be parsed with the timestamp formatter. | ||
| // Otherwise, it is likely to be a timestamp with timezone. | ||
| if ((allCatch opt timestampNTZFormatter.parseWithoutTimeZone(field, true)).isDefined) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe we should skip the parsing if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you elaborate a bit more? Thanks. My understanding was that the config indicated whether the output of parsing should be treated as TimestampNTZ or TimestampLTZ. |
||
| SQLConf.get.timestampType | ||
| } else { | ||
| tryParseTimestamp(field) | ||
| } | ||
|
|
@@ -225,6 +245,10 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
| } else { | ||
| Some(DecimalType(range + scale, scale)) | ||
| } | ||
|
|
||
| case (TimestampNTZType, TimestampType) | (TimestampType, TimestampNTZType) => | ||
| Some(TimestampType) | ||
|
|
||
| case _ => None | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -164,6 +164,10 @@ class CSVOptions( | |
| s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]" | ||
| }) | ||
|
|
||
| val timestampNTZFormatInRead: Option[String] = parameters.get("timestampNTZFormat") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How do you mean? Actually, it is the same config but reads require an
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since we are not going to respect the conf
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated!
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @gengliangwang I had to revert to using |
||
| val timestampNTZFormatInWrite: String = parameters.getOrElse("timestampNTZFormat", | ||
| s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS]") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wonder what's the reason to have the optional field. Another question: why is the precision in milliseconds but not in microseconds?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is the same reason as for timestampFormat above, I just copy-pasted it for timestampNTZFormat and removed the zone component.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The option was added when Spark's timestamp type had millisecond precision. Now the precision is microseconds, and I don't see any reason to lose info in write, by default.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we can change it for both LTZ and NTZ, if the precision loss here is a problem.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would rather change it separately, sounds more like a general problem with timestamps. |
||
|
|
||
| val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) | ||
|
|
||
| val maxColumns = getInt("maxColumns", 20480) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -442,17 +442,22 @@ object DateTimeUtils { | |
|
|
||
| /** | ||
| * Trims and parses a given UTF8 string to a corresponding [[Long]] value which represents the | ||
| * number of microseconds since the epoch. The result is independent of time zones, | ||
| * which means that zone ID in the input string will be ignored. | ||
| * number of microseconds since the epoch. The result will be independent of time zones. | ||
| * | ||
| * If the input string contains a component associated with time zone, the method will return | ||
| * `None` if `failOnError` is set to `true`. If `failOnError` is set to `false`, the method | ||
| * will simply discard the time zone component. Enable the check to detect situations like parsing | ||
| * a timestamp with time zone as TimestampNTZType. | ||
| * | ||
| * The return type is [[Option]] in order to distinguish between 0L and null. Please | ||
| * refer to `parseTimestampString` for the allowed formats. | ||
| */ | ||
| def stringToTimestampWithoutTimeZone(s: UTF8String): Option[Long] = { | ||
| def stringToTimestampWithoutTimeZone(s: UTF8String, failOnError: Boolean): Option[Long] = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The name
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Respectfully, I have changed this exact code and variable 3 times by now. Yes, I will update 😞. |
||
| try { | ||
| val (segments, _, justTime) = parseTimestampString(s) | ||
| // If the input string can't be parsed as a timestamp, or it contains only the time part of a | ||
| // timestamp and we can't determine its date, return None. | ||
| if (segments.isEmpty || justTime) { | ||
| val (segments, zoneIdOpt, justTime) = parseTimestampString(s) | ||
| // If the input string can't be parsed as a timestamp without time zone, or it contains only | ||
| // the time part of a timestamp and we can't determine its date, return None. | ||
| if (segments.isEmpty || justTime || failOnError && zoneIdOpt.isDefined) { | ||
| return None | ||
| } | ||
| val nanoseconds = MICROSECONDS.toNanos(segments(6)) | ||
|
|
@@ -465,8 +470,19 @@ object DateTimeUtils { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Trims and parses a given UTF8 string to a corresponding [[Long]] value which represents the | ||
| * number of microseconds since the epoch. The result is independent of time zones. Zone id | ||
| * component will be discarded and ignored. | ||
| * The return type is [[Option]] in order to distinguish between 0L and null. Please | ||
| * refer to `parseTimestampString` for the allowed formats. | ||
| */ | ||
| def stringToTimestampWithoutTimeZone(s: UTF8String): Option[Long] = { | ||
| stringToTimestampWithoutTimeZone(s, false) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I used |
||
| } | ||
|
|
||
| def stringToTimestampWithoutTimeZoneAnsi(s: UTF8String): Long = { | ||
| stringToTimestampWithoutTimeZone(s).getOrElse { | ||
| stringToTimestampWithoutTimeZone(s, false).getOrElse { | ||
| throw QueryExecutionErrors.cannotCastToDateTimeError(s, TimestampNTZType) | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,9 +31,10 @@ import org.apache.spark.sql.catalyst.util.DateTimeConstants._ | |
| import org.apache.spark.sql.catalyst.util.DateTimeUtils._ | ||
| import org.apache.spark.sql.catalyst.util.LegacyDateFormats.{LegacyDateFormat, LENIENT_SIMPLE_DATE_FORMAT} | ||
| import org.apache.spark.sql.catalyst.util.RebaseDateTime._ | ||
| import org.apache.spark.sql.errors.QueryExecutionErrors | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._ | ||
| import org.apache.spark.sql.types.Decimal | ||
| import org.apache.spark.sql.types.{Decimal, TimestampNTZType} | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
||
| sealed trait TimestampFormatter extends Serializable { | ||
|
|
@@ -55,6 +56,7 @@ sealed trait TimestampFormatter extends Serializable { | |
| * Parses a timestamp in a string and converts it to microseconds since Unix Epoch in local time. | ||
| * | ||
| * @param s - string with timestamp to parse | ||
| * @param failOnError - indicates strict parsing of timezone | ||
| * @return microseconds since epoch. | ||
| * @throws ParseException can be thrown by legacy parser | ||
| * @throws DateTimeParseException can be thrown by new parser | ||
|
|
@@ -66,10 +68,23 @@ sealed trait TimestampFormatter extends Serializable { | |
| @throws(classOf[DateTimeParseException]) | ||
| @throws(classOf[DateTimeException]) | ||
| @throws(classOf[IllegalStateException]) | ||
| def parseWithoutTimeZone(s: String): Long = | ||
| def parseWithoutTimeZone(s: String, failOnError: Boolean): Long = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ditto,
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Another question is, instead of adding a new parameter, can the caller side pick one of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure we can do it, because |
||
| throw new IllegalStateException( | ||
| s"The method `parseWithoutTimeZone(s: String)` should be implemented in the formatter " + | ||
| "of timestamp without time zone") | ||
| s"The method `parseWithoutTimeZone(s: String, failOnError: Boolean)` should be " + | ||
| "implemented in the formatter of timestamp without time zone") | ||
|
|
||
| /** | ||
| * Parses a timestamp in a string and converts it to microseconds since Unix Epoch in local time. | ||
| * Zone-id and zone-offset components are ignored. | ||
| */ | ||
| @throws(classOf[ParseException]) | ||
| @throws(classOf[DateTimeParseException]) | ||
| @throws(classOf[DateTimeException]) | ||
| @throws(classOf[IllegalStateException]) | ||
| final def parseWithoutTimeZone(s: String): Long = | ||
| // This is implemented to adhere to the original behaviour of `parseWithoutTimeZone` where we | ||
| // did not fail if timestamp contained zone-id or zone-offset component and instead ignored it. | ||
| parseWithoutTimeZone(s, false) | ||
|
|
||
| def format(us: Long): String | ||
| def format(ts: Timestamp): String | ||
|
|
@@ -118,9 +133,12 @@ class Iso8601TimestampFormatter( | |
| } catch checkParsedDiff(s, legacyFormatter.parse) | ||
| } | ||
|
|
||
| override def parseWithoutTimeZone(s: String): Long = { | ||
| override def parseWithoutTimeZone(s: String, failOnError: Boolean): Long = { | ||
| try { | ||
| val parsed = formatter.parse(s) | ||
| if (failOnError && parsed.query(TemporalQueries.zone()) != null) { | ||
| throw QueryExecutionErrors.cannotParseStringAsDataTypeError(pattern, s, TimestampNTZType) | ||
| } | ||
| val localDate = toLocalDate(parsed) | ||
| val localTime = toLocalTime(parsed) | ||
| DateTimeUtils.localDateTimeToMicros(LocalDateTime.of(localDate, localTime)) | ||
|
|
@@ -186,9 +204,13 @@ class DefaultTimestampFormatter( | |
| } catch checkParsedDiff(s, legacyFormatter.parse) | ||
| } | ||
|
|
||
| override def parseWithoutTimeZone(s: String): Long = { | ||
| override def parseWithoutTimeZone(s: String, failOnError: Boolean): Long = { | ||
| try { | ||
| DateTimeUtils.stringToTimestampWithoutTimeZoneAnsi(UTF8String.fromString(s)) | ||
| val utf8Value = UTF8String.fromString(s) | ||
| DateTimeUtils.stringToTimestampWithoutTimeZone(utf8Value, failOnError).getOrElse { | ||
| throw QueryExecutionErrors.cannotParseStringAsDataTypeError( | ||
| TimestampFormatter.defaultPattern(), s, TimestampNTZType) | ||
| } | ||
| } catch checkParsedDiff(s, legacyFormatter.parse) | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -368,4 +368,15 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession { | |
| .selectExpr("value.a") | ||
| checkAnswer(fromCsvDF, Row(localDT)) | ||
| } | ||
|
|
||
| test("SPARK-37326: Handle incorrectly formatted timestamp_ntz values in from_csv") { | ||
| val fromCsvDF = Seq("2021-08-12T15:16:23.000+11:00").toDF("csv") | ||
| .select( | ||
| from_csv( | ||
| $"csv", | ||
| StructType(StructField("a", TimestampNTZType) :: Nil), | ||
| Map.empty[String, String]) as "value") | ||
| .selectExpr("value.a") | ||
| checkAnswer(fromCsvDF, Row(null)) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about a positive test for the functions
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is a test right above that verifies the happy path for those functions. I only added a new test because it was missing from the original patch. |
||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
this part I'd defer to @MaxGekk to review.