[SPARK-39002][SQL] StringEndsWith/Contains support push down to Parquet #36328
**SQLConf.scala**

```diff
@@ -959,6 +959,15 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
+  val PARQUET_FILTER_PUSHDOWN_STRING_PREDICATE_ENABLED =
+    buildConf("spark.sql.parquet.filterPushdown.stringPredicate")
+      .doc("If true, enables Parquet filter push-down optimization for string predicate such " +
+        "as startsWith/endsWith/contains function. This configuration only has an effect when " +
+        s"'${PARQUET_FILTER_PUSHDOWN_ENABLED.key}' is enabled.")
+      .version("3.3.0")
+      .internal()
+      .fallbackConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED)
+
   val PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD =
     buildConf("spark.sql.parquet.pushdown.inFilterThreshold")
       .doc("For IN predicate, Parquet filter will push-down a set of OR clauses if its " +
```
```diff
@@ -4050,8 +4059,8 @@ class SQLConf extends Serializable with Logging {
 
   def parquetFilterPushDownDecimal: Boolean = getConf(PARQUET_FILTER_PUSHDOWN_DECIMAL_ENABLED)
 
-  def parquetFilterPushDownStringStartWith: Boolean =
-    getConf(PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED)
+  def parquetFilterPushDownStringPredicate: Boolean =
+    getConf(PARQUET_FILTER_PUSHDOWN_STRING_PREDICATE_ENABLED)
 
   def parquetFilterPushDownInFilterThreshold: Int =
     getConf(PARQUET_FILTER_PUSHDOWN_INFILTERTHRESHOLD)
```
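Per the doc string above, the new flag is only consulted when the umbrella Parquet push-down flag is on. A minimal sketch, assuming a Spark 3.3+ session named `spark`:

```scala
// Sketch, Spark 3.3+ session assumed. Turning the new flag off disables
// startsWith/endsWith/contains push-down in one place; it is ignored entirely
// when the umbrella flag below is off.
spark.conf.set("spark.sql.parquet.filterPushdown", "true")            // umbrella flag
spark.conf.set("spark.sql.parquet.filterPushdown.stringPredicate", "false")
```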
**ParquetFilters.scala**

```diff
@@ -48,7 +48,7 @@ class ParquetFilters(
     pushDownDate: Boolean,
     pushDownTimestamp: Boolean,
     pushDownDecimal: Boolean,
-    pushDownStartWith: Boolean,
+    pushDownStringPredicate: Boolean,
     pushDownInFilterThreshold: Int,
     caseSensitive: Boolean,
     datetimeRebaseSpec: RebaseSpec) {
```
```diff
@@ -747,7 +747,7 @@ class ParquetFilters(
     }
 
     case sources.StringStartsWith(name, prefix)
-        if pushDownStartWith && canMakeFilterOn(name, prefix) =>
+        if pushDownStringPredicate && canMakeFilterOn(name, prefix) =>
       Option(prefix).map { v =>
         FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames),
           new UserDefinedPredicate[Binary] with Serializable {
```
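For contrast with the cases added in the next hunk: startsWith can additionally prune row groups from column statistics, because all matching values share their leading bytes. A conceptual sketch of that property (simplified to `String`, not Spark's actual `Binary`-based implementation):

```scala
// Conceptual sketch, not the PR's code: startsWith(prefix) can prune a row
// group from its min/max alone. If the first prefix.length characters of max
// sort below the prefix, or those of min sort above it, no value in the
// [min, max] range can start with the prefix.
def canDropStartsWith(min: String, max: String, prefix: String): Boolean = {
  val n = prefix.length
  max.take(n) < prefix || min.take(n) > prefix
}
```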
```diff
@@ -778,6 +778,38 @@ class ParquetFilters(
         )
       }
 
+    case sources.StringEndsWith(name, prefix)
+        if pushDownStringPredicate && canMakeFilterOn(name, prefix) =>
+      Option(prefix).map { v =>
+        FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames),
+          new UserDefinedPredicate[Binary] with Serializable {
+            private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
+
+            override def canDrop(statistics: Statistics[Binary]): Boolean = false
+            override def inverseCanDrop(statistics: Statistics[Binary]): Boolean = false
+            override def keep(value: Binary): Boolean = {
+              value != null && UTF8String.fromBytes(value.getBytes).endsWith(
+                UTF8String.fromBytes(strToBinary.getBytes))
+            }
+          }
+        )
+      }
+
+    case sources.StringContains(name, value)
+        if pushDownStringPredicate && canMakeFilterOn(name, value) =>
+      Option(value).map { v =>
+        FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames),
+          new UserDefinedPredicate[Binary] with Serializable {
+            private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
+
+            override def canDrop(statistics: Statistics[Binary]): Boolean = false
+            override def inverseCanDrop(statistics: Statistics[Binary]): Boolean = false
+            override def keep(value: Binary): Boolean = {
+              value != null && UTF8String.fromBytes(value.getBytes).contains(
+                UTF8String.fromBytes(strToBinary.getBytes))
+            }
+          }
+        )
+      }
+
     case _ => None
   }
 }
```
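Note that both new predicates return `false` from `canDrop`/`inverseCanDrop`: row-group min/max statistics say nothing about whether any value in a range ends with or contains a substring, so pruning comes from Parquet's dictionary filtering instead, which evaluates `keep` against each dictionary entry and drops row groups with no match. A minimal usage sketch (the path and column name are hypothetical, and the string-predicate flag is assumed at its default of true):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical session; paths and column names are invented for illustration.
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = spark.read.parquet("/tmp/events")    // hypothetical dataset
df.filter($"name".endsWith("_2022")).count()  // planned as sources.StringEndsWith
df.filter($"name".contains("error")).count()  // planned as sources.StringContains
```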
**ParquetFilterSuite.scala**

```diff
@@ -81,7 +81,7 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
       datetimeRebaseSpec: RebaseSpec = RebaseSpec(LegacyBehaviorPolicy.CORRECTED)
     ): ParquetFilters =
     new ParquetFilters(schema, conf.parquetFilterPushDownDate, conf.parquetFilterPushDownTimestamp,
-      conf.parquetFilterPushDownDecimal, conf.parquetFilterPushDownStringStartWith,
+      conf.parquetFilterPushDownDecimal, conf.parquetFilterPushDownStringPredicate,
       conf.parquetFilterPushDownInFilterThreshold,
       caseSensitive.getOrElse(conf.caseSensitiveAnalysis),
       datetimeRebaseSpec)
```
```diff
@@ -1934,6 +1934,43 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared
       checkAnswer(notIn, Seq())
     }
   }
 
+  private def testStringPredicateWithDictionaryFilter(
+      dataFrame: DataFrame, filter: String): Unit = {
+    Seq(true, false).foreach { enableDictionary =>
+      withTempPath { dir =>
+        val path = dir.getCanonicalPath
+        dataFrame.write
+          .option(ParquetOutputFormat.ENABLE_DICTIONARY, enableDictionary)
+          .parquet(path)
+        Seq(true, false).foreach { pushDown =>
+          withSQLConf(
+            SQLConf.PARQUET_FILTER_PUSHDOWN_STRING_PREDICATE_ENABLED.key -> pushDown.toString) {
+            val accu = new NumRowGroupsAcc
+            sparkContext.register(accu)
+
+            val df = spark.read.parquet(path).filter(filter)
+            df.foreachPartition((it: Iterator[Row]) => it.foreach(v => accu.add(0)))
+            if (enableDictionary && pushDown) {
+              assert(accu.value == 0)
+            } else {
+              assert(accu.value > 0)
+            }
+
+            AccumulatorContext.remove(accu.id)
+          }
+        }
+      }
+    }
+  }
+
+  test("filter pushdown - StringEndsWith/Contains") {
+    ...
 
   test("filter pushdown - StringStartsWith") {
```
**Review discussion**
**Reviewer:** Oh, I mean this test specifically exercises `testStringPredicateWithDictionaryFilter`, but I don't see StringStartsWith included here. Don't we need to run `testStringPredicateWithDictionaryFilter` for it as well?
**Reviewer:** Can we merge the existing startsWith test into the new one?
**Author:** Updated.
**Reviewer:** Since `spark.sql.parquet.filterPushdown.string.startsWith` is internal, why not replace it?
**Author:** I'm afraid there are existing users who already rely on it.
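That concern is what the `.fallbackConf(...)` declaration above addresses. An illustrative sketch of the resulting behavior, assuming a Spark 3.3+ session named `spark`:

```scala
// Illustrative sketch of fallback semantics, assuming Spark 3.3+: when the new
// key is unset, reading it resolves to the legacy key's value, so jobs that
// only set the old internal flag keep their old behavior.
spark.conf.set("spark.sql.parquet.filterPushdown.string.startsWith", "false")
// Expected to print "false": the new key falls back to the legacy key.
println(spark.conf.get("spark.sql.parquet.filterPushdown.stringPredicate"))

// Setting the new key explicitly overrides the fallback.
spark.conf.set("spark.sql.parquet.filterPushdown.stringPredicate", "true")
```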