-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-33098][SQL] Avoid MetaException by not pushing down partition filters with incompatible types #30207
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
[SPARK-33098][SQL] Avoid MetaException by not pushing down partition filters with incompatible types #30207
Changes from all commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
4cc1976
First attempt
bersprockets e184352
Fix test
bersprockets 704d565
Handle inset as well
bersprockets e4bd24f
Add some tests
bersprockets c6cb7d2
Add additional tests
bersprockets 80b0006
Add additional test
bersprockets 4b77288
Add another case
bersprockets f85d965
Remove debugging log messages
bersprockets ac73ebd
Style fix
bersprockets b89a6aa
Put back some empty lines
bersprockets f0bafe9
Small fix; fix test
bersprockets a186127
Don't throw away all filters with mismatched datatypes
bersprockets File filter
Filter by extension
Conversations
Failed to load comments.
Loading…
Jump to
Jump to file
Failed to load files.
Loading…
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,7 +47,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchPermanentFunctionException | |
| import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, CatalogTablePartition, CatalogUtils, FunctionResource, FunctionResourceType} | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.types.{AtomicType, IntegralType, StringType} | ||
| import org.apache.spark.sql.types.{AtomicType, ByteType, DataType, IntegerType, IntegralType, LongType, ShortType, StringType} | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
| import org.apache.spark.util.Utils | ||
|
|
||
|
|
@@ -646,16 +646,16 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { | |
| } | ||
|
|
||
| object ExtractableLiteral { | ||
| def unapply(expr: Expression): Option[String] = expr match { | ||
| def unapply(expr: Expression): Option[(String, DataType)] = expr match { | ||
| case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs. | ||
| case Literal(value, _: IntegralType) => Some(value.toString) | ||
| case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString)) | ||
| case l @ Literal(value, _: IntegralType) => Some(value.toString, l.dataType) | ||
| case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString), StringType) | ||
| case _ => None | ||
| } | ||
| } | ||
|
|
||
| object ExtractableLiterals { | ||
| def unapply(exprs: Seq[Expression]): Option[Seq[String]] = { | ||
| def unapply(exprs: Seq[Expression]): Option[Seq[(String, DataType)]] = { | ||
| // SPARK-24879: The Hive metastore filter parser does not support "null", but we still want | ||
| // to push down as many predicates as we can while still maintaining correctness. | ||
| // In SQL, the `IN` expression evaluates as follows: | ||
|
|
@@ -682,15 +682,15 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { | |
| } | ||
|
|
||
| object ExtractableValues { | ||
| private lazy val valueToLiteralString: PartialFunction[Any, String] = { | ||
| case value: Byte => value.toString | ||
| case value: Short => value.toString | ||
| case value: Int => value.toString | ||
| case value: Long => value.toString | ||
| case value: UTF8String => quoteStringLiteral(value.toString) | ||
| private lazy val valueToLiteralString: PartialFunction[Any, (String, DataType)] = { | ||
| case value: Byte => (value.toString, ByteType) | ||
| case value: Short => (value.toString, ShortType) | ||
| case value: Int => (value.toString, IntegerType) | ||
| case value: Long => (value.toString, LongType) | ||
| case value: UTF8String => (quoteStringLiteral(value.toString), StringType) | ||
| } | ||
|
|
||
| def unapply(values: Set[Any]): Option[Seq[String]] = { | ||
| def unapply(values: Set[Any]): Option[Seq[(String, DataType)]] = { | ||
| val extractables = values.toSeq.map(valueToLiteralString.lift) | ||
| if (extractables.nonEmpty && extractables.forall(_.isDefined)) { | ||
| Some(extractables.map(_.get)) | ||
|
|
@@ -726,33 +726,79 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { | |
| val useAdvanced = SQLConf.get.advancedPartitionPredicatePushdownEnabled | ||
|
|
||
| object ExtractAttribute { | ||
| def unapply(expr: Expression): Option[Attribute] = { | ||
| def unapply(expr: Expression): Option[(Attribute, DataType)] = { | ||
| expr match { | ||
| case attr: Attribute => Some(attr) | ||
| case attr: Attribute => Some(attr, attr.dataType) | ||
| case Cast(child @ AtomicType(), dt: AtomicType, _) | ||
| if Cast.canUpCast(child.dataType.asInstanceOf[AtomicType], dt) => unapply(child) | ||
| case _ => None | ||
| } | ||
| } | ||
| } | ||
|
|
||
| def compatibleTypes(dt1: Any, dt2: Any): Boolean = | ||
| (dt1, dt2) match { | ||
| case (_: IntegralType, _: IntegralType) => true | ||
| case (_: StringType, _: StringType) => true | ||
| case _ => false | ||
| } | ||
|
|
||
| def compatibleTypesIn(dt1: Any, dts: Seq[Any]): Boolean = { | ||
| dts.forall(compatibleTypes(dt1, _)) | ||
| } | ||
|
|
||
| def fixValue(quotedValue: String, desiredType: DataType): Option[Any] = try { | ||
| val value = quotedValue.init.tail // remove leading and trailing quotes | ||
| desiredType match { | ||
| case LongType => | ||
| Some(value.toLong) | ||
| case IntegerType => | ||
| Some(value.toInt) | ||
| case ShortType => | ||
| Some(value.toShort) | ||
| case ByteType => | ||
| Some(value.toByte) | ||
| } | ||
| } catch { | ||
| case _: NumberFormatException => | ||
| None | ||
| } | ||
|
|
||
| def convert(expr: Expression): Option[String] = expr match { | ||
| case In(ExtractAttribute(SupportedAttribute(name)), ExtractableLiterals(values)) | ||
| if useAdvanced => | ||
| case In(ExtractAttribute(SupportedAttribute(name), dt1), ExtractableLiterals(valsAndDts)) | ||
| if useAdvanced && compatibleTypesIn(dt1, valsAndDts.map(_._2)) => | ||
| val values = valsAndDts.map(_._1) | ||
| Some(convertInToOr(name, values)) | ||
|
|
||
| case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values)) | ||
| if useAdvanced => | ||
| case InSet(ExtractAttribute(SupportedAttribute(name), dt1), ExtractableValues(valsAndDts)) | ||
| if useAdvanced && compatibleTypesIn(dt1, valsAndDts.map(_._2)) => | ||
| val values = valsAndDts.map(_._1) | ||
| Some(convertInToOr(name, values)) | ||
|
|
||
| case op @ SpecialBinaryComparison( | ||
| ExtractAttribute(SupportedAttribute(name)), ExtractableLiteral(value)) => | ||
| ExtractAttribute(SupportedAttribute(name), dt1), ExtractableLiteral(value, dt2)) | ||
| if compatibleTypes(dt1, dt2) => | ||
| Some(s"$name ${op.symbol} $value") | ||
|
|
||
| case op @ SpecialBinaryComparison( | ||
| ExtractableLiteral(value), ExtractAttribute(SupportedAttribute(name))) => | ||
| ExtractAttribute(SupportedAttribute(name), dt1), ExtractableLiteral(rawValue, dt2)) | ||
| if dt1.isInstanceOf[IntegralType] && dt2.isInstanceOf[StringType] => | ||
| fixValue(rawValue, dt1).map { value => | ||
| s"$name ${op.symbol} $value" | ||
| } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have an equivalent "attempt to correct" for In and Inset, just for binary comparisons. In the case of In and Inset, if the datatypes are not compatible, I just drop the filter (which is what would have happened before SPARK-22384) |
||
|
|
||
| case op @ SpecialBinaryComparison( | ||
| ExtractableLiteral(value, dt2), ExtractAttribute(SupportedAttribute(name), dt1)) | ||
| if compatibleTypes(dt1, dt2) => | ||
| Some(s"$value ${op.symbol} $name") | ||
|
|
||
| case op @ SpecialBinaryComparison( | ||
| ExtractableLiteral(rawValue, dt2), ExtractAttribute(SupportedAttribute(name), dt1)) | ||
| if dt1.isInstanceOf[IntegralType] && dt2.isInstanceOf[StringType] => | ||
| fixValue(rawValue, dt1).map { value => | ||
| s"$value ${op.symbol} $name" | ||
| } | ||
|
|
||
| case And(expr1, expr2) if useAdvanced => | ||
| val converted = convert(expr1) ++ convert(expr2) | ||
| if (converted.isEmpty) { | ||
|
|
@@ -795,6 +841,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { | |
|
|
||
| val partitions = | ||
| if (filter.isEmpty) { | ||
| logDebug(s"Falling back to getting all partitions") | ||
| getAllPartitionsMethod.invoke(hive, table).asInstanceOf[JSet[Partition]] | ||
| } else { | ||
| logDebug(s"Hive metastore filter is '$filter'.") | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hmm, will this change semantics? Suppose we have
`cast(b as string) < '012'` where `b` is 11. Before the conversion this will evaluate to false, but after it will evaluate to true.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it should probably ignore any literal strings with leading zeros.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Whatever makes sense. There is some (long-time) ongoing work with TypeCoercion (#22038) that fixes a few of these cases. But if that goes through and we can close the gap with the others, that would be fine. I am probably not in a position to provide much help in the optimizer code (at this point).