[HUDI-1591] [RFC-26] Implement Spark's FileIndex for Hudi to support queries via Hudi DataSource using non-globbed table path and partition pruning #2651
Changes from all commits
New file: trait SparkParsePartitionUtil (package org.apache.spark.sql.execution.datasources)

@@ -0,0 +1,34 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.datasources

import java.util.TimeZone

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.PartitioningUtils.PartitionValues
import org.apache.spark.sql.types.DataType

trait SparkParsePartitionUtil extends Serializable {

  def parsePartition(
      path: Path,
      typeInference: Boolean,
      basePaths: Set[Path],
      userSpecifiedDataTypes: Map[String, DataType],
      timeZone: TimeZone): Option[PartitionValues]
}
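The trait above only declares the contract; concrete implementations are not part of this excerpt. As a hedged sketch (the class name below is hypothetical), a Spark 2.x implementation could simply delegate to Spark's own partition parser. The trait deliberately lives in the org.apache.spark.sql.execution.datasources package so that an implementation can reach the package-private PartitioningUtils.parsePartition, which in Spark 2.4 returns a (Option[PartitionValues], Option[Path]) pair; only the first element is needed here.

package org.apache.spark.sql.execution.datasources

import java.util.TimeZone

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.PartitioningUtils.PartitionValues
import org.apache.spark.sql.types.DataType

// Hypothetical delegate; the real PR code may differ.
class Spark2ParsePartitionUtilSketch extends SparkParsePartitionUtil {

  override def parsePartition(
      path: Path,
      typeInference: Boolean,
      basePaths: Set[Path],
      userSpecifiedDataTypes: Map[String, DataType],
      timeZone: TimeZone): Option[PartitionValues] = {
    // Delegate to Spark's parser and keep only the parsed partition values.
    PartitioningUtils.parsePartition(path, typeInference, basePaths,
      userSpecifiedDataTypes, timeZone)._1
  }
}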
Changes to HoodieTableConfig:

@@ -18,6 +18,7 @@

 package org.apache.hudi.common.table;

+import java.util.Arrays;
 import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
 import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieTableType;

@@ -57,6 +58,7 @@ public class HoodieTableConfig implements Serializable {
   public static final String HOODIE_TABLE_TYPE_PROP_NAME = "hoodie.table.type";
   public static final String HOODIE_TABLE_VERSION_PROP_NAME = "hoodie.table.version";
   public static final String HOODIE_TABLE_PRECOMBINE_FIELD = "hoodie.table.precombine.field";
+  public static final String HOODIE_TABLE_PARTITION_COLUMNS = "hoodie.table.partition.columns";

   @Deprecated
   public static final String HOODIE_RO_FILE_FORMAT_PROP_NAME = "hoodie.table.ro.file.format";

@@ -193,6 +195,14 @@ public String getPreCombineField() {
     return props.getProperty(HOODIE_TABLE_PRECOMBINE_FIELD);
   }

+  public Option<String[]> getPartitionColumns() {
+    if (props.containsKey(HOODIE_TABLE_PARTITION_COLUMNS)) {
+      return Option.of(Arrays.stream(props.getProperty(HOODIE_TABLE_PARTITION_COLUMNS).split(","))
+          .filter(p -> p.length() > 0).collect(Collectors.toList()).toArray(new String[]{}));
+    }
+    return Option.empty();
+  }
+
   /**
    * Read the payload class for HoodieRecords from the table properties.
    */
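getPartitionColumns only yields the persisted column names; the reader side still has to turn them into a Spark partition schema. A minimal sketch of how that lookup might be consumed is shown below (the helper name and wiring are illustrative, not taken from the PR; the fallback mirrors the review reply further down that tables without this property are read as non-partitioned):

import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.spark.sql.types.StructType

// Hypothetical helper: build the partition schema from hoodie.table.partition.columns.
def resolvePartitionSchema(metaClient: HoodieTableMetaClient, tableSchema: StructType): StructType = {
  val partitionColumns = metaClient.getTableConfig.getPartitionColumns
  if (partitionColumns.isPresent) {
    // Keep only the table-schema fields named in hoodie.table.partition.columns.
    StructType(partitionColumns.get().map(name => tableSchema(name)))
  } else {
    // Tables written before this property existed carry no partition columns:
    // treat them as non-partitioned.
    StructType(Nil)
  }
}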
Changes to DefaultSource:

@@ -19,14 +19,16 @@ package org.apache.hudi

 import org.apache.hadoop.fs.Path
 import org.apache.hudi.DataSourceReadOptions._
-import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType}
+import org.apache.hudi.common.model.HoodieRecord
 import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION_OPT_KEY}
 import org.apache.hudi.common.fs.FSUtils
+import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_READ}
 import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.hadoop.HoodieROTablePathFilter
 import org.apache.log4j.LogManager
-import org.apache.spark.sql.execution.datasources.DataSource
+import org.apache.spark.sql.execution.datasources.{DataSource, FileStatusCache, HadoopFsRelation}
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.streaming.{Sink, Source}
 import org.apache.spark.sql.hudi.streaming.HoodieStreamSource
 import org.apache.spark.sql.sources._
@@ -79,39 +81,53 @@ class DefaultSource extends RelationProvider

     val allPaths = path.map(p => Seq(p)).getOrElse(Seq()) ++ readPaths

     val fs = FSUtils.getFs(allPaths.head, sqlContext.sparkContext.hadoopConfiguration)
-    val globPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(allPaths, fs)
-    val tablePath = DataSourceUtils.getTablePath(fs, globPaths.toArray)
+    // Use the HoodieFileIndex only if the 'path' is not globbed.
+    // Or else we use the original way to read hoodie table.
+    val enableFileIndex = optParams.get(ENABLE_HOODIE_FILE_INDEX)
+      .map(_.toBoolean).getOrElse(DEFAULT_ENABLE_HOODIE_FILE_INDEX)
+    val useHoodieFileIndex = enableFileIndex && path.isDefined && !path.get.contains("*") &&
+      !parameters.contains(DataSourceReadOptions.READ_PATHS_OPT_KEY)
+    val globPaths = if (useHoodieFileIndex) {
+      None
+    } else {
+      Some(HoodieSparkUtils.checkAndGlobPathIfNecessary(allPaths, fs))
+    }
+    // Get the table base path
+    val tablePath = if (globPaths.isDefined) {
+      DataSourceUtils.getTablePath(fs, globPaths.get.toArray)
+    } else {
+      DataSourceUtils.getTablePath(fs, Array(new Path(path.get)))
+    }
     log.info("Obtained hudi table path: " + tablePath)

     val metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf).setBasePath(tablePath).build()
     val isBootstrappedTable = metaClient.getTableConfig.getBootstrapBasePath.isPresent
-    log.info("Is bootstrapped table => " + isBootstrappedTable)
-    if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_SNAPSHOT_OPT_VAL)) {
-      if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) {
-        if (isBootstrappedTable) {
-          // Snapshot query is not supported for Bootstrapped MOR tables
-          log.warn("Snapshot query is not supported for Bootstrapped Merge-on-Read tables." +
-            " Falling back to Read Optimized query.")
-          new HoodieBootstrapRelation(sqlContext, schema, globPaths, metaClient, optParams)
-        } else {
-          new MergeOnReadSnapshotRelation(sqlContext, optParams, schema, globPaths, metaClient)
-        }
-      } else {
-        getBaseFileOnlyView(sqlContext, parameters, schema, readPaths, isBootstrappedTable, globPaths, metaClient)
-      }
-    } else if(parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)) {
-      getBaseFileOnlyView(sqlContext, parameters, schema, readPaths, isBootstrappedTable, globPaths, metaClient)
-    } else if (parameters(QUERY_TYPE_OPT_KEY).equals(QUERY_TYPE_INCREMENTAL_OPT_VAL)) {
-      val metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf).setBasePath(tablePath).build()
-      if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) {
-        new MergeOnReadIncrementalRelation(sqlContext, optParams, schema, metaClient)
-      } else {
-        new IncrementalRelation(sqlContext, optParams, schema, metaClient)
-      }
-    } else {
-      throw new HoodieException("Invalid query type :" + parameters(QUERY_TYPE_OPT_KEY))
+    val tableType = metaClient.getTableType
+    val queryType = parameters(QUERY_TYPE_OPT_KEY)
+    log.info(s"Is bootstrapped table => $isBootstrappedTable, tableType is: $tableType")

+    (tableType, queryType, isBootstrappedTable) match {
[Review comment, project member] this is very neat, thanks @pengzhiwei2018!
+      case (COPY_ON_WRITE, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) |
+           (COPY_ON_WRITE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) |
+           (MERGE_ON_READ, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) =>
+        getBaseFileOnlyView(useHoodieFileIndex, sqlContext, parameters, schema, tablePath,
+          readPaths, metaClient)

+      case (COPY_ON_WRITE, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) =>
+        new IncrementalRelation(sqlContext, parameters, schema, metaClient)

+      case (MERGE_ON_READ, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) =>
+        new MergeOnReadSnapshotRelation(sqlContext, parameters, schema, globPaths, metaClient)

+      case (MERGE_ON_READ, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) =>
+        new MergeOnReadIncrementalRelation(sqlContext, parameters, schema, metaClient)

+      case (_, _, true) =>
+        new HoodieBootstrapRelation(sqlContext, schema, globPaths, metaClient, parameters)

+      case (_, _, _) =>
+        throw new HoodieException(s"Invalid query type : $queryType for tableType: $tableType," +
+          s"isBootstrappedTable: $isBootstrappedTable ")
     }
   }
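For orientation, here is a hedged usage sketch of what the new dispatch enables from the user side: with a plain (non-globbed) base path and no read-paths option, a COPY_ON_WRITE snapshot or read-optimized query goes through getBaseFileOnlyView with useHoodieFileIndex = true, so partition predicates can prune partitions instead of listing the whole table. The path and partition column name below are made up.

import org.apache.hudi.DataSourceReadOptions
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("hudi-file-index-example").getOrCreate()

val df = spark.read.format("hudi")
  // Explicit opt-in shown for clarity; the default comes from DEFAULT_ENABLE_HOODIE_FILE_INDEX.
  .option(DataSourceReadOptions.ENABLE_HOODIE_FILE_INDEX, "true")
  .load("/data/hudi/trips")               // plain base path, no "/*/*" globbing
  .where("partition = '2021/03/01'")      // partition column name is illustrative

df.show()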
@@ -162,18 +178,28 @@ class DefaultSource extends RelationProvider

   override def shortName(): String = "hudi"

-  private def getBaseFileOnlyView(sqlContext: SQLContext,
+  private def getBaseFileOnlyView(useHoodieFileIndex: Boolean,
+                                  sqlContext: SQLContext,
                                   optParams: Map[String, String],
                                   schema: StructType,
+                                  tablePath: String,
                                   extraReadPaths: Seq[String],
-                                  isBootstrappedTable: Boolean,
-                                  globPaths: Seq[Path],
                                   metaClient: HoodieTableMetaClient): BaseRelation = {
-    log.warn("Loading Base File Only View.")
+    log.info("Loading Base File Only View with options :" + optParams)

+    if (useHoodieFileIndex) {
+      val fileIndex = HoodieFileIndex(sqlContext.sparkSession, metaClient,
+        if (schema == null) Option.empty[StructType] else Some(schema),
+        optParams, FileStatusCache.getOrCreate(sqlContext.sparkSession))

-    if (isBootstrappedTable) {
-      // For bootstrapped tables, use our custom Spark relation for querying
-      new HoodieBootstrapRelation(sqlContext, schema, globPaths, metaClient, optParams)
+      HadoopFsRelation(
+        fileIndex,
+        fileIndex.partitionSchema,
+        fileIndex.dataSchema,
+        bucketSpec = None,
+        fileFormat = new ParquetFileFormat,
+        optParams)(sqlContext.sparkSession)
     } else {
       // this is just effectively RO view only, where `path` can contain a mix of
       // non-hoodie/hoodie path files. set the path filter up

@@ -182,7 +208,6 @@ class DefaultSource extends RelationProvider
         classOf[HoodieROTablePathFilter],
         classOf[org.apache.hadoop.fs.PathFilter])

-      log.info("Constructing hoodie (as parquet) data source with options :" + optParams)
       // simply return as a regular parquet relation
       DataSource.apply(
         sparkSession = sqlContext.sparkSession,
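HoodieFileIndex itself does not appear in this excerpt. To be handed to HadoopFsRelation as above, it has to implement Spark's FileIndex contract; the skeleton below shows that contract only, with a placeholder class name and empty bodies rather than the PR's implementation. Partition pruning materializes in listFiles, where Spark passes the partition filters.

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.execution.datasources.{FileIndex, PartitionDirectory}
import org.apache.spark.sql.types.StructType

// Placeholder skeleton; the real implementation in this PR is HoodieFileIndex.
class FileIndexSkeleton extends FileIndex {
  // Paths Spark treats as the roots of the relation (the Hudi base path here).
  override def rootPaths: Seq[Path] = Seq.empty

  // Partition pruning entry point: only files of partitions matching
  // partitionFilters should be returned for scanning.
  override def listFiles(
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression]): Seq[PartitionDirectory] = Seq.empty

  override def inputFiles: Array[String] = Array.empty

  override def refresh(): Unit = ()

  override def sizeInBytes: Long = 0L

  // Built from hoodie.table.partition.columns plus the resolved table schema.
  override def partitionSchema: StructType = StructType(Nil)
}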
Review discussion:

[Review comment] I think we should persist the key generator class, and not the partition columns themselves? Let me think over this more.

[Reply] Hi @vinothchandar, we need the partition schema for partition pruning in Spark SQL, so the partition columns are needed for that; otherwise we cannot get the partition schema.

[Review comment] How does this work for existing tables? Do we need an upgrade-downgrade step for writing this to hoodie.properties?

[Reply] For existing tables that do not store the partition columns, we query them as a non-partitioned table.
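The write side of the new property is not shown in this excerpt. Purely as a hedged illustration of the upgrade question above, persisting the configured partition fields at table creation could look roughly like this (the field names and surrounding wiring are assumptions):

import java.util.Properties

import org.apache.hudi.common.table.HoodieTableConfig

// Assumed example partition fields; a real writer would take these from its own config.
val partitionFields = Seq("year", "month", "day")

val props = new Properties()
props.setProperty(HoodieTableConfig.HOODIE_TABLE_PARTITION_COLUMNS, partitionFields.mkString(","))
// The table init/upgrade path would then write props into .hoodie/hoodie.properties,
// after which getPartitionColumns() returns Option.of(Array("year", "month", "day")).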