-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-25559][SQL] Remove the unsupported predicates in Parquet when possible #22574
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
8c76e31
7792e99
29a9aa0
d37b2e8
9a9e47f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -394,7 +394,13 @@ private[parquet] class ParquetFilters( | |
| */ | ||
| def createFilter(schema: MessageType, predicate: sources.Filter): Option[FilterPredicate] = { | ||
| val nameToParquetField = getFieldMap(schema) | ||
| createFilterHelper(nameToParquetField, predicate, canRemoveOneSideInAnd = true) | ||
| } | ||
|
|
||
| private def createFilterHelper( | ||
| nameToParquetField: Map[String, ParquetField], | ||
| predicate: sources.Filter, | ||
| canRemoveOneSideInAnd: Boolean): Option[FilterPredicate] = { | ||
| // Decimal type must make sure that filter value's scale matched the file. | ||
| // If doesn't matched, which would cause data corruption. | ||
| def isDecimalMatched(value: Any, decimalMeta: DecimalMetadata): Boolean = value match { | ||
|
|
@@ -488,26 +494,25 @@ private[parquet] class ParquetFilters( | |
| .map(_(nameToParquetField(name).fieldName, value)) | ||
|
|
||
| case sources.And(lhs, rhs) => | ||
| // At here, it is not safe to just convert one side if we do not understand the | ||
| // other side. Here is an example used to explain the reason. | ||
| // Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to | ||
| // convert b in ('1'). If we only convert a = 2, we will end up with a filter | ||
| // NOT(a = 2), which will generate wrong results. | ||
| // Pushing one side of AND down is only safe to do at the top level. | ||
| // You can see ParquetRelation's initializeLocalJobFunc method as an example. | ||
| for { | ||
| lhsFilter <- createFilter(schema, lhs) | ||
| rhsFilter <- createFilter(schema, rhs) | ||
| } yield FilterApi.and(lhsFilter, rhsFilter) | ||
| // If the unsupported predicate is in the top level `And` condition or in the child | ||
| // `And` condition before hitting `Not` or `Or` condition, it can be safely removed. | ||
| (createFilterHelper(nameToParquetField, lhs, canRemoveOneSideInAnd = true), | ||
|
||
| createFilterHelper(nameToParquetField, rhs, canRemoveOneSideInAnd = true)) match { | ||
|
||
| case (Some(lhsFilter), Some(rhsFilter)) => Some(FilterApi.and(lhsFilter, rhsFilter)) | ||
| case (Some(lhsFilter), None) if canRemoveOneSideInAnd => Some(lhsFilter) | ||
| case (None, Some(rhsFilter)) if canRemoveOneSideInAnd => Some(rhsFilter) | ||
| case _ => None | ||
| } | ||
|
|
||
| case sources.Or(lhs, rhs) => | ||
| for { | ||
| lhsFilter <- createFilter(schema, lhs) | ||
| rhsFilter <- createFilter(schema, rhs) | ||
| lhsFilter <- createFilterHelper(nameToParquetField, lhs, canRemoveOneSideInAnd = false) | ||
| rhsFilter <- createFilterHelper(nameToParquetField, rhs, canRemoveOneSideInAnd = false) | ||
| } yield FilterApi.or(lhsFilter, rhsFilter) | ||
|
|
||
| case sources.Not(pred) => | ||
| createFilter(schema, pred).map(FilterApi.not) | ||
| createFilterHelper(nameToParquetField, pred, canRemoveOneSideInAnd = false) | ||
| .map(FilterApi.not) | ||
|
|
||
| case sources.In(name, values) if canMakeFilterOn(name, values.head) | ||
| && values.distinct.length <= pushDownInFilterThreshold => | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This example is good to show the cases we can't remove one side. Can we still keep it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
addressed and added more tests.