[HUDI-4420] Fixing table schema delineation on partition/data schema for Spark relations #5708
Changes from all commits: 57755d3, 01a54f1, c77a31c, 502e93e, 443ee26, ca3024a, b0e9209, 9b5f3cd, 6ffb5aa, 2a63374, be05a41, eb5f747, f9beb7e, b29986e, 807927b, b450eec, 5b6281e, 1f3cd58, d66c47c, 8f0c173, d9be73d, 381d9af, 4865e96, e121b00
The diff below covers `HoodieBaseRelation` and its companion object:

```diff
@@ -45,7 +45,7 @@ import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
+import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression, UnsafeProjection}
 import org.apache.spark.sql.execution.FileRelation
 import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.{HoodieParquetFileFormat, ParquetFileFormat}
```
```diff
@@ -274,7 +274,7 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
   def canPruneRelationSchema: Boolean =
     (fileFormat.isInstanceOf[ParquetFileFormat] || fileFormat.isInstanceOf[OrcFileFormat]) &&
       // NOTE: Some relations might be disabling sophisticated schema pruning techniques (for ex, nested schema pruning)
-      // TODO(HUDI-XXX) internal schema doesn't supported nested schema pruning currently
+      // TODO(HUDI-XXX) internal schema doesn't support nested schema pruning currently
       !hasSchemaOnRead

   override def schema: StructType = {
```
```diff
@@ -334,58 +334,32 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
     val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchemaStr, internalSchemaOpt)
     val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString, Some(requiredInternalSchema))
 
-    // Since schema requested by the caller might contain partition columns, we might need to
-    // prune it, removing all partition columns from it in case these columns are not persisted
-    // in the data files
-    //
-    // NOTE: This partition schema is only relevant to file reader to be able to embed
-    // values of partition columns (hereafter referred to as partition values) encoded into
-    // the partition path, and omitted from the data file, back into fetched rows;
-    // Note that, by default, partition columns are not omitted therefore specifying
-    // partition schema for reader is not required
-    val (partitionSchema, dataSchema, requiredDataSchema) =
-      tryPrunePartitionColumns(tableSchema, requiredSchema)
-
     if (fileSplits.isEmpty) {
       sparkSession.sparkContext.emptyRDD
     } else {
-      val rdd = composeRDD(fileSplits, partitionSchema, dataSchema, requiredDataSchema, targetColumns, filters)
-
-      // NOTE: In case when partition columns have been pruned from the required schema, we have to project
-      // the rows from the pruned schema back into the one expected by the caller
-      val projectedRDD = if (requiredDataSchema.structTypeSchema != requiredSchema.structTypeSchema) {
-        rdd.mapPartitions { it =>
-          val fullPrunedSchema = StructType(requiredDataSchema.structTypeSchema.fields ++ partitionSchema.fields)
-          val unsafeProjection = generateUnsafeProjection(fullPrunedSchema, requiredSchema.structTypeSchema)
-          it.map(unsafeProjection)
-        }
-      } else {
-        rdd
-      }
+      val rdd = composeRDD(fileSplits, tableSchema, requiredSchema, targetColumns, filters)
 
       // Here we rely on a type erasure, to workaround inherited API restriction and pass [[RDD[InternalRow]]] back as [[RDD[Row]]]
       // Please check [[needConversion]] scala-doc for more details
-      projectedRDD.asInstanceOf[RDD[Row]]
+      rdd.asInstanceOf[RDD[Row]]
     }
   }
 
   /**
    * Composes RDD provided file splits to read from, table and partition schemas, data filters to be applied
    *
    * @param fileSplits       file splits to be handled by the RDD
-   * @param partitionSchema  target table's partition schema
-   * @param dataSchema       target table's data files' schema
+   * @param tableSchema      target table's schema
    * @param requiredSchema   projected schema required by the reader
    * @param requestedColumns columns requested by the query
    * @param filters          data filters to be applied
-   * @return instance of RDD (implementing [[HoodieUnsafeRDD]])
+   * @return instance of RDD (holding [[InternalRow]]s)
    */
   protected def composeRDD(fileSplits: Seq[FileSplit],
-                           partitionSchema: StructType,
-                           dataSchema: HoodieTableSchema,
+                           tableSchema: HoodieTableSchema,
                            requiredSchema: HoodieTableSchema,
                            requestedColumns: Array[String],
-                           filters: Array[Filter]): HoodieUnsafeRDD
+                           filters: Array[Filter]): RDD[InternalRow]
 
   /**
    * Provided with partition and date filters collects target file splits to read records from, while
```
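For context on the `rdd.asInstanceOf[RDD[Row]]` line above, here is a minimal standalone sketch of the type-erasure workaround; the helper name is hypothetical, and the cast is only safe because relations reporting `needConversion = false` hand Spark rows that are already `InternalRow`s:

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow

// JVM generics are erased at runtime, so this cast does no per-row work;
// Spark skips Row conversion when the relation sets needConversion = false.
def eraseToRowRDD(rdd: RDD[InternalRow]): RDD[Row] =
  rdd.asInstanceOf[RDD[Row]]
```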
```diff
@@ -553,7 +527,7 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
   protected def createBaseFileReader(spark: SparkSession,
                                      partitionSchema: StructType,
                                      dataSchema: HoodieTableSchema,
-                                     requiredSchema: HoodieTableSchema,
+                                     requiredDataSchema: HoodieTableSchema,
                                      filters: Seq[Filter],
                                      options: Map[String, String],
                                      hadoopConf: Configuration): BaseFileReader = {
```
```diff
@@ -564,42 +538,56 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
     // we have to eagerly initialize all of the readers even though only one specific to the type
     // of the file being read will be used. This is required to avoid serialization of the whole
     // relation (containing file-index for ex) and passing it to the executor
-    val reader = tableBaseFileFormat match {
-      case HoodieFileFormat.PARQUET =>
-        HoodieDataSourceHelper.buildHoodieParquetReader(
-          sparkSession = spark,
-          dataSchema = dataSchema.structTypeSchema,
-          partitionSchema = partitionSchema,
-          requiredSchema = requiredSchema.structTypeSchema,
-          filters = filters,
-          options = options,
-          hadoopConf = hadoopConf,
-          // We're delegating to Spark to append partition values to every row only in cases
-          // when these corresponding partition-values are not persisted w/in the data file itself
-          appendPartitionValues = shouldExtractPartitionValuesFromPartitionPath
-        )
-
-      case HoodieFileFormat.HFILE =>
-        createHFileReader(
-          spark = spark,
-          dataSchema = dataSchema,
-          requiredSchema = requiredSchema,
-          filters = filters,
-          options = options,
-          hadoopConf = hadoopConf
-        )
-
-      case _ => throw new UnsupportedOperationException(s"Base file format is not currently supported ($tableBaseFileFormat)")
-    }
-
-    partitionedFile => {
-      val extension = FSUtils.getFileExtension(partitionedFile.filePath)
-      if (tableBaseFileFormat.getFileExtension.equals(extension)) {
-        reader.apply(partitionedFile)
-      } else {
-        throw new UnsupportedOperationException(s"Invalid base-file format ($extension), expected ($tableBaseFileFormat)")
-      }
-    }
+    val (read: (PartitionedFile => Iterator[InternalRow]), schema: StructType) =
+      tableBaseFileFormat match {
+        case HoodieFileFormat.PARQUET =>
+          val parquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
+            sparkSession = spark,
+            dataSchema = dataSchema.structTypeSchema,
+            partitionSchema = partitionSchema,
+            requiredSchema = requiredDataSchema.structTypeSchema,
+            filters = filters,
+            options = options,
+            hadoopConf = hadoopConf,
+            // We're delegating to Spark to append partition values to every row only in cases
+            // when these corresponding partition-values are not persisted w/in the data file itself
+            appendPartitionValues = shouldExtractPartitionValuesFromPartitionPath
+          )
+          // Since partition values by default are omitted, and not persisted w/in data-files by Spark,
+          // data-file readers (such as [[ParquetFileFormat]]) have to inject partition values while reading
+          // the data. As such, actual full schema produced by such reader is composed of
+          //    a) Data-file schema (projected or not)
+          //    b) Appended partition column values
+          val readerSchema = StructType(requiredDataSchema.structTypeSchema.fields ++ partitionSchema.fields)
+
+          (parquetReader, readerSchema)
+
+        case HoodieFileFormat.HFILE =>
+          val hfileReader = createHFileReader(
+            spark = spark,
+            dataSchema = dataSchema,
+            requiredDataSchema = requiredDataSchema,
+            filters = filters,
+            options = options,
+            hadoopConf = hadoopConf
+          )
+
+          (hfileReader, requiredDataSchema.structTypeSchema)
+
+        case _ => throw new UnsupportedOperationException(s"Base file format is not currently supported ($tableBaseFileFormat)")
+      }
+
+    BaseFileReader(
+      read = partitionedFile => {
+        val extension = FSUtils.getFileExtension(partitionedFile.filePath)
+        if (tableBaseFileFormat.getFileExtension.equals(extension)) {
+          read(partitionedFile)
+        } else {
+          throw new UnsupportedOperationException(s"Invalid base-file format ($extension), expected ($tableBaseFileFormat)")
+        }
+      },
+      schema = schema
+    )
   }
 
   protected def embedInternalSchema(conf: Configuration, internalSchemaOpt: Option[InternalSchema]): Configuration = {
```
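To make the `readerSchema` composition above concrete, here is a small self-contained sketch with hypothetical column names (`id`, `name`, `dt`); the reader emits the projected data columns first and the appended partition values last:

```scala
import org.apache.spark.sql.types._

// Hypothetical projected data schema and partition schema
val requiredDataSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("name", StringType)))
val partitionSchema = StructType(Seq(
  StructField("dt", StringType)))

// Full schema of rows produced by the file reader: data columns, then partition values
val readerSchema = StructType(requiredDataSchema.fields ++ partitionSchema.fields)
// readerSchema.simpleString == "struct<id:bigint,name:string,dt:string>"
```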
```diff
@@ -615,8 +603,17 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
     conf
   }
 
-  private def tryPrunePartitionColumns(tableSchema: HoodieTableSchema,
-                                       requiredSchema: HoodieTableSchema): (StructType, HoodieTableSchema, HoodieTableSchema) = {
+  protected def tryPrunePartitionColumns(tableSchema: HoodieTableSchema,
+                                         requiredSchema: HoodieTableSchema): (StructType, HoodieTableSchema, HoodieTableSchema) = {
+    // Since schema requested by the caller might contain partition columns, we might need to
+    // prune it, removing all partition columns from it in case these columns are not persisted
+    // in the data files
+    //
+    // NOTE: This partition schema is only relevant to file reader to be able to embed
+    // values of partition columns (hereafter referred to as partition values) encoded into
+    // the partition path, and omitted from the data file, back into fetched rows;
+    // Note that, by default, partition columns are not omitted therefore specifying
+    // partition schema for reader is not required
     if (shouldExtractPartitionValuesFromPartitionPath) {
       val partitionSchema = StructType(partitionColumns.map(StructField(_, StringType)))
       val prunedDataStructSchema = prunePartitionColumns(tableSchema.structTypeSchema)
```
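A minimal, self-contained sketch (with hypothetical column names) of the split performed by `tryPrunePartitionColumns` when partition values have to be extracted from the partition path: partition columns are typed as strings, since their values are parsed back from path segments, and are removed from the data schema:

```scala
import org.apache.spark.sql.types._

val partitionColumns = Seq("dt")
val tableSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("name", StringType),
  StructField("dt", StringType)))

// Partition columns become a string-typed schema of their own...
val partitionSchema = StructType(partitionColumns.map(StructField(_, StringType)))
// ...and are dropped from the schema used to read the data files
val prunedDataSchema =
  StructType(tableSchema.fields.filterNot(f => partitionColumns.contains(f.name)))
// partitionSchema: struct<dt:string>; prunedDataSchema: struct<id:bigint,name:string>
```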
```diff
@@ -645,17 +642,45 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
 
 object HoodieBaseRelation extends SparkAdapterSupport {
 
-  type BaseFileReader = PartitionedFile => Iterator[InternalRow]
+  case class BaseFileReader(read: PartitionedFile => Iterator[InternalRow], val schema: StructType) {
+    def apply(file: PartitionedFile): Iterator[InternalRow] = read.apply(file)
+  }
 
-  private def generateUnsafeProjection(from: StructType, to: StructType) =
-    sparkAdapter.getCatalystExpressionUtils().generateUnsafeProjection(from, to)
+  def generateUnsafeProjection(from: StructType, to: StructType): UnsafeProjection =
+    sparkAdapter.getCatalystExpressionUtils.generateUnsafeProjection(from, to)
 
   def convertToAvroSchema(structSchema: StructType): Schema =
     sparkAdapter.getAvroSchemaConverters.toAvroType(structSchema, nullable = false, "Record")
 
   def getPartitionPath(fileStatus: FileStatus): Path =
     fileStatus.getPath.getParent
 
+  /**
+   * Projects provided file reader's output from its original schema, into a [[requiredSchema]]
+   *
+   * NOTE: [[requiredSchema]] has to be a proper subset of the file reader's schema
+   *
+   * @param reader file reader to be projected
+   * @param requiredSchema target schema for the output of the provided file reader
+   */
+  def projectReader(reader: BaseFileReader, requiredSchema: StructType): BaseFileReader = {
+    checkState(reader.schema.fields.toSet.intersect(requiredSchema.fields.toSet).size == requiredSchema.size)
+
+    if (reader.schema == requiredSchema) {
+      reader
+    } else {
+      val read = reader.apply(_)
+      val projectedRead: PartitionedFile => Iterator[InternalRow] = (file: PartitionedFile) => {
+        // NOTE: Projection is not a serializable object, hence its creation should only happen w/in
+        // the executor process
+        val unsafeProjection = generateUnsafeProjection(reader.schema, requiredSchema)
+        read(file).map(unsafeProjection)
+      }
+
+      BaseFileReader(projectedRead, requiredSchema)
+    }
+  }
 
   /**
    * Projects provided schema by picking only required (projected) top-level columns from it
    *
```

A review thread on `projectReader`:

**Contributor:** Why do we still need `requiredSchema`?

**Author:** Please check my comment where this method is used for an example: whenever we prune partition columns, the ordering of the columns changes (partition columns are removed and then appended to the resulting schema). Without projecting back into the required schema, the caller would get a dataset with an incorrect ordering of the columns.
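To illustrate the author's point about ordering, a sketch with hypothetical schemas: after pruning, the reader appends partition columns last, while the caller may have requested them in a different position, so a projection from the reader's schema to the required schema is needed:

```scala
import org.apache.spark.sql.types._

// What the file reader actually produces: data columns, then the appended partition column
val readerSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("name", StringType),
  StructField("dt", StringType)))

// What the caller requested: partition column first
val requiredSchema = StructType(Seq(
  StructField("dt", StringType),
  StructField("id", LongType),
  StructField("name", StringType)))

// projectReader would generate UnsafeProjection(readerSchema -> requiredSchema) on the
// executor and map it over the reader's output to restore the requested ordering.
```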
```diff
@@ -666,7 +691,6 @@ object HoodieBaseRelation extends SparkAdapterSupport {
     tableSchema match {
       case Right(internalSchema) =>
         checkState(!internalSchema.isEmptySchema)
-        // TODO extend pruning to leverage optimizer pruned schema
         val prunedInternalSchema = InternalSchemaUtils.pruneInternalSchema(internalSchema, requiredColumns.toList.asJava)
         val requiredAvroSchema = AvroInternalSchemaConverter.convert(prunedInternalSchema, "schema")
         val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema)
```
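For intuition, top-level column pruning like `pruneInternalSchema` amounts to keeping only the requested columns, in the requested order. A hypothetical helper sketching the same idea against a plain Spark `StructType` (not Hudi's actual implementation):

```scala
import org.apache.spark.sql.types.StructType

// Keep only the requested top-level columns, preserving the requested order
def pruneTopLevel(schema: StructType, requiredColumns: Seq[String]): StructType =
  StructType(requiredColumns.flatMap(name => schema.fields.find(_.name == name)))
```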
```diff
@@ -691,10 +715,10 @@ object HoodieBaseRelation extends SparkAdapterSupport {
 
   private def createHFileReader(spark: SparkSession,
                                 dataSchema: HoodieTableSchema,
-                                requiredSchema: HoodieTableSchema,
+                                requiredDataSchema: HoodieTableSchema,
                                 filters: Seq[Filter],
                                 options: Map[String, String],
-                                hadoopConf: Configuration): BaseFileReader = {
+                                hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
     val hadoopConfBroadcast =
       spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
 
```
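The broadcast at the end of this hunk follows a common pattern: Hadoop's `Configuration` is not Java-serializable, so it is wrapped and broadcast once from the driver. A minimal sketch, assuming Spark's `SerializableConfiguration` wrapper (public in Spark 3.x):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.apache.spark.util.SerializableConfiguration

def broadcastHadoopConf(spark: SparkSession,
                        hadoopConf: Configuration): Broadcast[SerializableConfiguration] = {
  // Configuration is not Java-serializable; the wrapper makes it broadcast-safe
  val bc = spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
  // On an executor, unwrap with: bc.value.value
  bc
}
```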
```diff
@@ -703,10 +727,10 @@ object HoodieBaseRelation extends SparkAdapterSupport {
       val reader = new HoodieHFileReader[GenericRecord](hadoopConf, new Path(partitionedFile.filePath),
         new CacheConfig(hadoopConf))
 
-      val requiredRowSchema = requiredSchema.structTypeSchema
+      val requiredRowSchema = requiredDataSchema.structTypeSchema
       // NOTE: Schema has to be parsed at this point, since Avro's [[Schema]] aren't serializable
       // to be passed from driver to executor
-      val requiredAvroSchema = new Schema.Parser().parse(requiredSchema.avroSchemaStr)
+      val requiredAvroSchema = new Schema.Parser().parse(requiredDataSchema.avroSchemaStr)
       val avroToRowConverter = AvroConversionUtils.createAvroToInternalRowConverter(requiredAvroSchema, requiredRowSchema)
 
       reader.getRecordIterator(requiredAvroSchema).asScala
```
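The `Schema.Parser` call above reflects the serialization constraint noted in the diff's own comment: Avro `Schema` objects aren't serializable, so the schema travels to executors as a JSON string and is re-parsed there. A tiny sketch with a hypothetical schema string:

```scala
import org.apache.avro.Schema

// Shipped from the driver as a plain string (e.g. inside HoodieTableSchema.avroSchemaStr)
val avroSchemaStr =
  """{"type":"record","name":"Rec","fields":[{"name":"id","type":"long"}]}"""

// Re-parsed on the executor, where the reader needs a live Schema instance
val requiredAvroSchema: Schema = new Schema.Parser().parse(avroSchemaStr)
```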
A follow-up review exchange:

**Contributor:** Please raise a PR, and I will repair it later.

**Author:** You mean a JIRA, right? Will do.