-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-40474][SQL] Correct CSV schema inference and data parsing behavior on columns with mixed dates and timestamps #37933
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
0394030
f4fadf7
813ac74
5c2dde8
0d2be1d
df56946
4bc480d
f6ed29f
93b6422
6942f2b
b4a6f1d
1502618
4767ae7
1f57098
e9150ec
a07e432
255aea3
533c487
be4c86f
c7225b1
9e87d6e
af66b83
812fa65
5288eb0
a2f0b80
00a8661
16e187c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -123,10 +123,8 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
| case LongType => tryParseLong(field) | ||
| case _: DecimalType => tryParseDecimal(field) | ||
| case DoubleType => tryParseDouble(field) | ||
| case DateType => tryParseDateTime(field) | ||
| case TimestampNTZType if options.prefersDate => tryParseDateTime(field) | ||
| case DateType => tryParseDate(field) | ||
| case TimestampNTZType => tryParseTimestampNTZ(field) | ||
| case TimestampType if options.prefersDate => tryParseDateTime(field) | ||
| case TimestampType => tryParseTimestamp(field) | ||
| case BooleanType => tryParseBoolean(field) | ||
| case StringType => StringType | ||
|
|
@@ -179,13 +177,13 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
| if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) { | ||
| DoubleType | ||
| } else if (options.prefersDate) { | ||
| tryParseDateTime(field) | ||
| tryParseDate(field) | ||
| } else { | ||
| tryParseTimestampNTZ(field) | ||
| } | ||
| } | ||
|
|
||
| private def tryParseDateTime(field: String): DataType = { | ||
| private def tryParseDate(field: String): DataType = { | ||
| if ((allCatch opt dateFormatter.parse(field)).isDefined) { | ||
| DateType | ||
| } else { | ||
|
|
@@ -233,7 +231,12 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
| * is compatible with both input data types. | ||
| */ | ||
| private def compatibleType(t1: DataType, t2: DataType): Option[DataType] = { | ||
| TypeCoercion.findTightestCommonType(t1, t2).orElse(findCompatibleTypeForCSV(t1, t2)) | ||
| (t1, t2) match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this match be in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What result does
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Never mind — I checked the code, and the resulting type will be TimestampType or TimestampNTZType. |
||
| // For fields that mix dates and timestamps, relax the inferred type to string type | ||
| case (DateType, TimestampType) | (TimestampType, DateType) | | ||
| (DateType, TimestampNTZType) | (TimestampNTZType, DateType) => Some(StringType) | ||
| case _ => TypeCoercion.findTightestCommonType(t1, t2).orElse(findCompatibleTypeForCSV(t1, t2)) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -149,13 +149,10 @@ class CSVOptions( | |||||
| val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US) | ||||||
|
|
||||||
| /** | ||||||
| * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type) | ||||||
| * if schema inference is enabled. When being used with user-provided schema, tries to parse | ||||||
| * timestamp values as dates if the values do not conform to the timestamp formatter before | ||||||
| * falling back to the backward compatible parsing - the parsed values will be cast to timestamp | ||||||
| * afterwards. | ||||||
| * Infer columns with all valid date entries as date type (otherwise inferred as string type) | ||||||
|
||||||
| * Infer columns with all valid date entries as date type (otherwise inferred as string type) | |
| * Infer columns with all valid date entries as date type (otherwise inferred as string or timestamp type) |
Uh oh!
There was an error while loading. Please reload this page.