-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-3812] Fixing Data Skipping configuration to respect Metadata Table configs #5244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7ab435b
66c6a70
e53b673
55010fe
1e2e98f
e51ec65
562b737
56a4aed
8a886d1
7b7faea
cf0ecea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,11 +85,6 @@ case class HoodieFileIndex(spark: SparkSession, | |
|
|
||
| override def rootPaths: Seq[Path] = queryPaths.asScala | ||
|
|
||
| def isDataSkippingEnabled: Boolean = { | ||
| options.getOrElse(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), | ||
| spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean | ||
| } | ||
|
|
||
| /** | ||
| * Returns the FileStatus for all the base files (excluding log files). This should be used only for | ||
| * cases where Spark directly fetches the list of files via HoodieFileIndex or for read optimized query logic | ||
|
|
@@ -196,12 +191,20 @@ case class HoodieFileIndex(spark: SparkSession, | |
| * @return list of pruned (data-skipped) candidate base-files' names | ||
| */ | ||
| private def lookupCandidateFilesInMetadataTable(queryFilters: Seq[Expression]): Try[Option[Set[String]]] = Try { | ||
| if (!isDataSkippingEnabled || queryFilters.isEmpty || !HoodieTableMetadataUtil.getCompletedMetadataPartitions(metaClient.getTableConfig) | ||
| .contains(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS)) { | ||
| // NOTE: Data Skipping is only effective when it references columns that are indexed w/in | ||
| // the Column Stats Index (CSI). Following cases could not be effectively handled by Data Skipping: | ||
| // - Expressions on top-level column's fields (ie, for ex filters like "struct.field > 0", since | ||
| // CSI only contains stats for top-level columns, in this case for "struct") | ||
| // - Any expression not directly referencing top-level column (for ex, sub-queries, since there's | ||
| // nothing CSI in particular could be applied for) | ||
| lazy val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema) | ||
|
|
||
| if (!isMetadataTableEnabled || !isColumnStatsIndexAvailable || !isDataSkippingEnabled) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. given that CSI does not have stats for top level columns, if predicate references both top level and non-top level columns, we gonna skip leveraging CSI is it? since anyways, for non top level column, we have to visit all data files?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, how do we deduce what columns have been indexed in MDT CSI? So, when we are looking to apply data skipping on the query side, should we check for these configs and decided whether a particular col is indexed by CSI or not ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It depends on the predicate, but we will at least try to leverage it to filter out for top-level columns only
We can't do that, we have to play by what's actually in index: this is handled when we execute the filter against lookup table -- if it doesn't contain the column of the filter, it will just match all of the files.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. However, your question made me realize that we're actually deriving index schema incorrectly currently. Let me address that
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nsivabalan i'm addressing this problem in a separate PR to avoid overloading this one: #5275 |
||
| validateConfig() | ||
| Option.empty | ||
| } else if (queryFilters.isEmpty || queryReferencedColumns.isEmpty) { | ||
| Option.empty | ||
| } else { | ||
| val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema) | ||
|
|
||
| val colStatsDF: DataFrame = readColumnStatsIndex(spark, basePath, metadataConfig, queryReferencedColumns) | ||
|
|
||
| // Persist DF to avoid re-computing column statistics unraveling | ||
|
|
@@ -245,13 +248,27 @@ case class HoodieFileIndex(spark: SparkSession, | |
|
|
||
| override def refresh(): Unit = super.refresh() | ||
|
|
||
| override def inputFiles: Array[String] = { | ||
| val fileStatusList = allFiles | ||
| fileStatusList.map(_.getPath.toString).toArray | ||
| } | ||
| override def inputFiles: Array[String] = | ||
| allFiles.map(_.getPath.toString).toArray | ||
|
|
||
| override def sizeInBytes: Long = { | ||
| cachedFileSize | ||
| override def sizeInBytes: Long = cachedFileSize | ||
|
|
||
| private def isColumnStatsIndexAvailable = | ||
| HoodieTableMetadataUtil.getCompletedMetadataPartitions(metaClient.getTableConfig) | ||
| .contains(HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS) | ||
|
|
||
| private def isDataSkippingEnabled: Boolean = | ||
| options.getOrElse(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), | ||
| spark.sessionState.conf.getConfString(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "false")).toBoolean | ||
|
|
||
| private def isMetadataTableEnabled: Boolean = metadataConfig.enabled() | ||
| private def isColumnStatsIndexEnabled: Boolean = metadataConfig.isColumnStatsIndexEnabled | ||
|
|
||
| private def validateConfig(): Unit = { | ||
| if (isDataSkippingEnabled && (!isMetadataTableEnabled || !isColumnStatsIndexEnabled)) { | ||
| logWarning("Data skipping requires both Metadata Table and Column Stats Index to be enabled as well! " + | ||
| s"(isMetadataTableEnabled = ${isMetadataTableEnabled}, isColumnStatsIndexEnabled = ${isColumnStatsIndexEnabled}") | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.