[SPARK-17091][SQL] Add rule to convert IN predicate to equivalent Parquet filter #21603
SQLConf.scala

@@ -378,6 +378,17 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)

+  val PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD =
+    buildConf("spark.sql.parquet.pushdown.inFilterThreshold")
+      .doc("The maximum number of distinct values in an IN predicate that will be " +
+        "converted to a Parquet filter for push-down. A larger threshold does not " +
+        "necessarily give better performance; our experiments suggest roughly 300 as " +
+        "the practical upper limit. This configuration only has an effect when " +
+        "'spark.sql.parquet.filterPushdown' is enabled.")
+      .internal()
+      .intConf
+      .checkValue(threshold => threshold > 0, "The threshold must be greater than 0.")
+      .createWithDefault(10)
+
   val PARQUET_WRITE_LEGACY_FORMAT = buildConf("spark.sql.parquet.writeLegacyFormat")
     .doc("Whether to be compatible with the legacy Parquet format adopted by Spark 1.4 and prior " +
       "versions, when converting Parquet schema to Spark SQL schema and vice versa.")
@@ -1420,6 +1431,9 @@ class SQLConf extends Serializable with Logging {

   def parquetFilterPushDownDate: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_DATE_ENABLED)

+  def parquetFilterPushDownInFilterThreshold: Int =
+    getConf(PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD)
+
   def orcFilterPushDown: Boolean = getConf(ORC_FILTER_PUSHDOWN_ENABLED)

   def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH)
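For reference, a minimal sketch of tuning the new knob at runtime, assuming a spark-shell-style SparkSession named `spark`; the value 100 and the path are illustrative only, and note the config is marked .internal() in this PR:

// Push-down must be enabled for the IN threshold to matter.
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
// Allow IN predicates with up to 100 distinct values to be converted (default is 10).
spark.conf.set("spark.sql.parquet.pushdown.inFilterThreshold", "100")

// An IN predicate within the threshold can now be turned into a Parquet filter:
spark.read.parquet("/tmp/data").where("a IN (1, 2, 3)").count()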
ParquetFilters.scala

@@ -25,13 +25,19 @@ import org.apache.parquet.io.api.Binary

 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLDate
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources
 import org.apache.spark.sql.types._

 /**
  * Some utility function to convert Spark data source filters to Parquet filters.
  */
-private[parquet] class ParquetFilters(pushDownDate: Boolean) {
+private[parquet] class ParquetFilters {
+
+  val sqlConf: SQLConf = SQLConf.get
+
+  val pushDownDate = sqlConf.parquetFilterPushDownDate
+  val pushDownInFilterThreshold = sqlConf.parquetFilterPushDownInFilterThreshold

   private def dateToDays(date: Date): SQLDate = {
     DateTimeUtils.fromJavaDate(date)
@@ -270,6 +276,12 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean) {
     case sources.Not(pred) =>
       createFilter(schema, pred).map(FilterApi.not)

+    case sources.In(name, values)
+        if canMakeFilterOn(name) && values.distinct.length <= pushDownInFilterThreshold =>
+      values.distinct.flatMap { v =>
+        makeEq.lift(nameToType(name)).map(_(name, v))
+      }.reduceLeftOption(FilterApi.or)
+
Contributor: What about null handling? Do we get the same result as before? Anyway, can we add a test for it?
     case _ => None
   }
 }
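To make the conversion concrete, here is a small standalone sketch (using Parquet's FilterApi directly, outside Spark) of the predicate shape the new case produces for a IN (10, 20, 30); the column name and values are illustrative. It also bears on the Contributor's null question above: in parquet-mr, eq with a null value expresses an is-null check, which is what the Array(null) assertion in the test below pins down.

import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
import org.apache.parquet.filter2.predicate.FilterApi.intColumn

// a IN (10, 20, 30) becomes eq filters OR-ed left to right, mirroring
// values.distinct.flatMap(...).reduceLeftOption(FilterApi.or) in the diff:
val eqs: Seq[FilterPredicate] =
  Seq(10, 20, 30).map(v => FilterApi.eq(intColumn("a"), Int.box(v)))
val inPredicate: FilterPredicate = eqs.reduceLeft(FilterApi.or(_, _))
// inPredicate == or(or(eq(a, 10), eq(a, 20)), eq(a, 30))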
ParquetFilterSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources.parquet
 import java.nio.charset.StandardCharsets
 import java.sql.Date

-import org.apache.parquet.filter2.predicate.{FilterPredicate, Operators}
+import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Operators}
 import org.apache.parquet.filter2.predicate.FilterApi._
 import org.apache.parquet.filter2.predicate.Operators.{Column => _, _}
@@ -55,7 +55,7 @@ import org.apache.spark.util.{AccumulatorContext, AccumulatorV2}
  */
 class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext {

-  private lazy val parquetFilters = new ParquetFilters(conf.parquetFilterPushDownDate)
+  private lazy val parquetFilters = new ParquetFilters()

   override def beforeEach(): Unit = {
     super.beforeEach()
@@ -660,6 +660,64 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext {
       assert(df.where("col > 0").count() === 2)
     }
   }

+  test("SPARK-17091: Convert IN predicate to Parquet filter push-down") {
+    val schema = StructType(Seq(
+      StructField("a", IntegerType, nullable = false)
+    ))
+
+    assertResult(Some(FilterApi.eq(intColumn("a"), null: Integer))) {
+      parquetFilters.createFilter(schema, sources.In("a", Array(null)))
+    }
+
+    assertResult(Some(FilterApi.eq(intColumn("a"), 10: Integer))) {
+      parquetFilters.createFilter(schema, sources.In("a", Array(10)))
+    }
+
+    // Remove duplicates
+    assertResult(Some(FilterApi.eq(intColumn("a"), 10: Integer))) {
+      parquetFilters.createFilter(schema, sources.In("a", Array(10, 10)))
+    }
+
+    assertResult(Some(or(
+      FilterApi.eq(intColumn("a"), 10: Integer),
+      FilterApi.eq(intColumn("a"), 20: Integer)))
+    ) {
+      parquetFilters.createFilter(schema, sources.In("a", Array(10, 20)))
+    }
+
+    assertResult(Some(or(or(
+      FilterApi.eq(intColumn("a"), 10: Integer),
+      FilterApi.eq(intColumn("a"), 20: Integer)),
+      FilterApi.eq(intColumn("a"), 30: Integer)))
+    ) {
+      parquetFilters.createFilter(schema, sources.In("a", Array(10, 20, 30)))
+    }
+
+    assert(parquetFilters.createFilter(schema, sources.In("a",
+      Range(0, conf.parquetFilterPushDownInFilterThreshold).toArray)).isDefined)
+    assert(parquetFilters.createFilter(schema, sources.In("a",
+      Range(0, conf.parquetFilterPushDownInFilterThreshold + 1).toArray)).isEmpty)
+
+    import testImplicits._
+    withTempPath { path =>
+      (0 to 1024).toDF("a").selectExpr("if (a = 1024, null, a) AS a") // convert 1024 to null
+        .coalesce(1).write.option("parquet.block.size", 512)
+        .parquet(path.getAbsolutePath)
+      val df = spark.read.parquet(path.getAbsolutePath)
+      Seq(true, false).foreach { pushEnabled =>
Member: @wangyum, does this really test that Parquet itself pushes the filter down fine? I think you should …

Author: Updated to:

val actual = stripSparkFilter(df.where(filter)).collect().length
if (pushEnabled && count <= conf.parquetFilterPushDownInFilterThreshold) {
  assert(actual > 1 && actual < data.length)
} else {
  assert(actual === data.length)
}

(stripSparkFilter removes Spark's own Filter operator from the plan, so any reduction in the rows returned must come from the Parquet-side push-down.)
+        withSQLConf(
+            SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> pushEnabled.toString) {
+          Seq(1, 5, 10, 11, 1000).foreach { count =>
+            assert(df.where(s"a in(${Range(0, count).mkString(",")})").count() === count)
+          }
+          assert(df.where("a in(null)").count() === 0)
+          assert(df.where("a = null").count() === 0)
+          assert(df.where("a is null").count() === 1)
+        }
+      }
+    }
+  }

 class NumRowGroupsAcc extends AccumulatorV2[Integer, Integer] {
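Beyond the unit test, a quick way to see whether the predicate actually reached the scan is to inspect the physical plan. A sketch, assuming a SparkSession named `spark` and an illustrative path (the exact output shape varies by Spark version):

val df = spark.read.parquet("/tmp/data")
df.where("a IN (10, 20, 30)").explain()
// With spark.sql.parquet.filterPushdown enabled, the FileScan parquet node
// should report something like: PushedFilters: [In(a, [10,20,30])]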
Follow-up review discussion:

"This also depends on the data types, right?"

"You are right."

"An interesting finding. Thanks for the update. Maybe you do not need to mention this limit threshold in the doc? How about posting your finding in the PR description?"

"See the comment #21603 (comment)."