-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-30530][SQL] Fix filter pushdown for bad CSV records #27239
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -230,64 +230,55 @@ class UnivocityParser( | |||||||||||||||
| () => getCurrentInput, | ||||||||||||||||
| () => None, | ||||||||||||||||
| new RuntimeException("Malformed CSV record")) | ||||||||||||||||
| } else if (tokens.length != parsedSchema.length) { | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
| var checkedTokens = tokens | ||||||||||||||||
| var badRecordException: Option[Throwable] = None | ||||||||||||||||
|
|
||||||||||||||||
| if (tokens.length != parsedSchema.length) { | ||||||||||||||||
| // If the number of tokens doesn't match the schema, we should treat it as a malformed record. | ||||||||||||||||
| // However, we still have chance to parse some of the tokens, by adding extra null tokens in | ||||||||||||||||
| // the tail if the number is smaller, or by dropping extra tokens if the number is larger. | ||||||||||||||||
| val checkedTokens = if (parsedSchema.length > tokens.length) { | ||||||||||||||||
| checkedTokens = if (parsedSchema.length > tokens.length) { | ||||||||||||||||
| tokens ++ new Array[String](parsedSchema.length - tokens.length) | ||||||||||||||||
| } else { | ||||||||||||||||
| tokens.take(parsedSchema.length) | ||||||||||||||||
| } | ||||||||||||||||
| def getPartialResult(): Option[InternalRow] = { | ||||||||||||||||
| try { | ||||||||||||||||
| convert(checkedTokens).headOption | ||||||||||||||||
| } catch { | ||||||||||||||||
| case _: BadRecordException => None | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
| // For records with less or more tokens than the schema, tries to return partial results | ||||||||||||||||
| // if possible. | ||||||||||||||||
| throw BadRecordException( | ||||||||||||||||
| () => getCurrentInput, | ||||||||||||||||
| () => getPartialResult(), | ||||||||||||||||
| new RuntimeException("Malformed CSV record")) | ||||||||||||||||
| } else { | ||||||||||||||||
| // When the length of the returned tokens is identical to the length of the parsed schema, | ||||||||||||||||
| // we just need to: | ||||||||||||||||
| // 1. Convert the tokens that correspond to the required schema. | ||||||||||||||||
| // 2. Apply the pushdown filters to `requiredRow`. | ||||||||||||||||
| var i = 0 | ||||||||||||||||
| val row = requiredRow.head | ||||||||||||||||
| var skipRow = false | ||||||||||||||||
| var badRecordException: Option[Throwable] = None | ||||||||||||||||
| while (i < requiredSchema.length) { | ||||||||||||||||
| try { | ||||||||||||||||
| if (!skipRow) { | ||||||||||||||||
| row(i) = valueConverters(i).apply(getToken(tokens, i)) | ||||||||||||||||
| if (csvFilters.skipRow(row, i)) { | ||||||||||||||||
| skipRow = true | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
| if (skipRow) { | ||||||||||||||||
| row.setNullAt(i) | ||||||||||||||||
| badRecordException = Some(new RuntimeException("Malformed CSV record")) | ||||||||||||||||
| } | ||||||||||||||||
| // When the length of the returned tokens is identical to the length of the parsed schema, | ||||||||||||||||
| // we just need to: | ||||||||||||||||
| // 1. Convert the tokens that correspond to the required schema. | ||||||||||||||||
| // 2. Apply the pushdown filters to `requiredRow`. | ||||||||||||||||
| var i = 0 | ||||||||||||||||
| val row = requiredRow.head | ||||||||||||||||
| var skipRow = false | ||||||||||||||||
| while (i < requiredSchema.length) { | ||||||||||||||||
| try { | ||||||||||||||||
| if (!skipRow) { | ||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: if (skipRow) {
row.setNullAt(i)
} else {
row(i) = valueConverters(i).apply(getToken(tokens, i))
if (csvFilters.skipRow(row, i)) {
skipRow = true
}
} |
||||||||||||||||
| row(i) = valueConverters(i).apply(getToken(tokens, i)) | ||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if the first column is corrupted, and the predicate is
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are 3 cases:
New implementation with filters pushdown does not change the behavior in those cases. |
||||||||||||||||
| if (csvFilters.skipRow(row, i)) { | ||||||||||||||||
| skipRow = true | ||||||||||||||||
| } | ||||||||||||||||
| } catch { | ||||||||||||||||
| case NonFatal(e) => | ||||||||||||||||
| badRecordException = badRecordException.orElse(Some(e)) | ||||||||||||||||
| row.setNullAt(i) | ||||||||||||||||
| } | ||||||||||||||||
| i += 1 | ||||||||||||||||
| if (skipRow) { | ||||||||||||||||
| row.setNullAt(i) | ||||||||||||||||
| } | ||||||||||||||||
| } catch { | ||||||||||||||||
| case NonFatal(e) => | ||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previously we rely on |
||||||||||||||||
| badRecordException = badRecordException.orElse(Some(e)) | ||||||||||||||||
| row.setNullAt(i) | ||||||||||||||||
| } | ||||||||||||||||
| if (skipRow) { | ||||||||||||||||
| noRows | ||||||||||||||||
| i += 1 | ||||||||||||||||
| } | ||||||||||||||||
| if (skipRow) { | ||||||||||||||||
| noRows | ||||||||||||||||
| } else { | ||||||||||||||||
| if (badRecordException.isDefined) { | ||||||||||||||||
| throw BadRecordException( | ||||||||||||||||
| () => getCurrentInput, () => requiredRow.headOption, badRecordException.get) | ||||||||||||||||
| } else { | ||||||||||||||||
| if (badRecordException.isDefined) { | ||||||||||||||||
| throw BadRecordException( | ||||||||||||||||
| () => getCurrentInput, () => requiredRow.headOption, badRecordException.get) | ||||||||||||||||
| } else { | ||||||||||||||||
| requiredRow | ||||||||||||||||
| } | ||||||||||||||||
| requiredRow | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need this
checkedTokensnow?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems not. The
ifcan be replaced by:Let me do that in a follow up PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, I already did at #27287. Let me address this comment there.