[comet-parquet-exec] Use Parquet schema for scan instead of Spark schema #1103
Changes from all commits
563178c
3f1be1d
0a233b9
8adaf95
20828bd
c9ca89a
de6425a
0fdfd9d
5c7b73f
@@ -29,25 +29,26 @@ import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, Normalize
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning, SinglePartition}
import org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils
import org.apache.spark.sql.comet.{CometBroadcastExchangeExec, CometNativeScanExec, CometScanExec, CometSinkPlaceHolder, CometSparkToColumnarExec, DecimalPrecision}
import org.apache.spark.sql.comet.{CometBroadcastExchangeExec, CometScanExec, CometSinkPlaceHolder, CometSparkToColumnarExec, DecimalPrecision}
import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
import org.apache.spark.sql.execution
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec, ShuffleQueryStageExec}
import org.apache.spark.sql.execution.aggregate.{BaseAggregateExec, HashAggregateExec, ObjectHashAggregateExec}
import org.apache.spark.sql.execution.datasources.{FilePartition, FileScanRDD}
import org.apache.spark.sql.execution.datasources.parquet.SparkToParquetSchemaConverter
import org.apache.spark.sql.execution.datasources.v2.{DataSourceRDD, DataSourceRDDPartition}
import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec}
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, HashJoin, ShuffledHashJoinExec, SortMergeJoinExec}
import org.apache.spark.sql.execution.window.WindowExec
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.SerializableConfiguration

import org.apache.comet.CometConf
import org.apache.comet.CometSparkSessionExtensions.{isCometScan, isSpark34Plus, withInfo}
import org.apache.comet.expressions.{CometCast, CometEvalMode, Compatible, Incompatible, RegExp, Unsupported}
import org.apache.comet.parquet.FooterReader
import org.apache.comet.serde.ExprOuterClass.{AggExpr, DataType => ProtoDataType, Expr, ScalarFunc}
import org.apache.comet.serde.ExprOuterClass.DataType.{DataTypeInfo, DecimalInfo, ListInfo, MapInfo, StructInfo}
import org.apache.comet.serde.OperatorOuterClass.{AggregateMode => CometAggregateMode, BuildSide, JoinType, Operator}
@@ -2507,23 +2508,22 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim
        partitions.foreach(p => {
          val inputPartitions = p.asInstanceOf[DataSourceRDDPartition].inputPartitions
          inputPartitions.foreach(partition => {
            partition2Proto(partition.asInstanceOf[FilePartition], nativeScanBuilder)
            partition2Proto(partition.asInstanceOf[FilePartition], nativeScanBuilder, scan)
          })
        })
      case rdd: FileScanRDD =>
        rdd.filePartitions.foreach(partition => {
          partition2Proto(partition, nativeScanBuilder)
          partition2Proto(partition, nativeScanBuilder, scan)
        })
      case _ =>
        assert(false)
    }

    val requiredSchemaParquet =
      new SparkToParquetSchemaConverter(conf).convert(scan.requiredSchema)
    val dataSchemaParquet =
      new SparkToParquetSchemaConverter(conf).convert(scan.relation.dataSchema)
    val projection_vector: Array[java.lang.Long] = scan.requiredSchema.fields.map(field => {
      scan.relation.dataSchema.fieldIndex(field.name).toLong.asInstanceOf[java.lang.Long]
    })

    nativeScanBuilder.setRequiredSchema(requiredSchemaParquet.toString)
    nativeScanBuilder.setDataSchema(dataSchemaParquet.toString)
    nativeScanBuilder.addAllProjectionVector(projection_vector.toIterable.asJava)

    Some(result.setNativeScan(nativeScanBuilder).build())
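For reference, the projection vector built in this hunk simply records, for each field of the pruned required schema, that field's ordinal in the full data schema. A minimal standalone sketch (the schemas below are made up for illustration and are not from the PR):

```scala
import org.apache.spark.sql.types._

// Toy schemas: the full file/data schema and the pruned read ("required") schema.
val dataSchema = StructType(Seq(
  StructField("a", IntegerType),
  StructField("b", StringType),
  StructField("c", LongType)))
val requiredSchema = StructType(Seq(
  StructField("c", LongType),
  StructField("a", IntegerType)))

// Same mapping as projection_vector above: the ordinal of each required field
// within the data schema.
val projectionVector: Array[Long] =
  requiredSchema.fields.map(f => dataSchema.fieldIndex(f.name).toLong)
// projectionVector == Array(2L, 0L)
```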
@@ -3191,9 +3191,34 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim
  private def partition2Proto(
      partition: FilePartition,
      nativeScanBuilder: OperatorOuterClass.NativeScan.Builder): Unit = {
      nativeScanBuilder: OperatorOuterClass.NativeScan.Builder,
      scan: CometScanExec): Unit = {
    val partitionBuilder = OperatorOuterClass.SparkFilePartition.newBuilder()
    val sparkContext = scan.session.sparkContext
    var schema_saved: Boolean = false;
    partition.files.foreach(file => {
      if (!schema_saved) {
        // TODO: This code shouldn't be here, but for POC it's fine.
        // Extract the schema and stash it.
        val hadoopConf =
          scan.relation.sparkSession.sessionState.newHadoopConfWithOptions(scan.relation.options)
        val broadcastedHadoopConf =
          sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
        val sharedConf = broadcastedHadoopConf.value.value
        val footer = FooterReader.readFooter(sharedConf, file)
Contributor:
You're right. This can never be in production code. For one, this is expensive.

Contributor (Author):
In theory it's just replacing the call that currently takes place during the

Contributor:
I didn't know about a footersCache in Spark. Could you share a link maybe?

Contributor (Author):

Contributor:
Ah, thanks. I don't think we ever travel this path.
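Since the discussion mentions a footers cache, here is a rough sketch of what memoizing footers per file path could look like. FooterCache and readFooterForPath are hypothetical names; this is neither Spark's footers cache nor Comet code:

```scala
import java.util.concurrent.ConcurrentHashMap
import org.apache.parquet.hadoop.metadata.ParquetMetadata

// Hypothetical per-path footer memoization (illustration only).
object FooterCache {
  private val cache = new ConcurrentHashMap[String, ParquetMetadata]()

  def getOrRead(path: String, read: String => ParquetMetadata): ParquetMetadata =
    cache.computeIfAbsent(path, p => read(p))
}

// Usage sketch, assuming readFooterForPath wraps FooterReader.readFooter:
// val footer = FooterCache.getOrRead(file.pathUri.toString, readFooterForPath)
```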
        val footerFileMetaData = footer.getFileMetaData
        val schema = footerFileMetaData.getSchema
        nativeScanBuilder.setDataSchema(schema.toString)

        // val clippedSchema = CometParquetReadSupport.clipParquetSchema(
        //   schema,
        //   scan.requiredSchema,
        //   scan.session.sessionState.conf.caseSensitiveAnalysis,
        //   CometParquetUtils.readFieldId(scan.session.sessionState.conf),
        //   CometParquetUtils.ignoreMissingIds(scan.session.sessionState.conf))

        schema_saved = true
      }
      val fileBuilder = OperatorOuterClass.SparkPartitionedFile.newBuilder()
      fileBuilder
        .setFilePath(file.pathUri.toString)
This change essentially means that any schema 'adaptation' made in SparkToParquetSchemaConverter.convert to support legacy timestamps and decimals will not be supported; we will probably see tests fail with incorrect results. Also, Comet's Parquet file reader uses CometParquetReadSupport.clipParquetSchema to do a similar conversion, and it includes support for the Parquet field_id, which is desirable for delta sources like Iceberg. Basically a field_id, if present, identifies a field more precisely (in the event of field name changes) in a schema.
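To make the concern concrete, here is a hedged sketch of the two schema sources being compared: the Parquet schema derived from the Spark schema via SparkToParquetSchemaConverter (which consults the legacy timestamp/decimal settings), versus the schema read straight from the file footer as this PR does. The schema and configuration below are illustrative, not from the PR, and assume a recent Spark 3.x:

```scala
import org.apache.spark.sql.execution.datasources.parquet.SparkToParquetSchemaConverter
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._

// Parquet schema derived from the Spark (Catalyst) schema. This path consults
// SQLConf (e.g. spark.sql.parquet.writeLegacyFormat, outputTimestampType) when
// choosing physical types for decimals and timestamps.
val sqlConf = new SQLConf
val sparkSchema = StructType(Seq(
  StructField("ts", TimestampType),
  StructField("price", DecimalType(10, 2))))
val fromSparkSchema = new SparkToParquetSchemaConverter(sqlConf).convert(sparkSchema)

// Schema taken from the file footer instead (as the new code does). For files
// written with legacy settings (e.g. INT96 timestamps, FIXED_LEN_BYTE_ARRAY
// decimals), the physical types here can differ from `fromSparkSchema`.
// val fromFooter = FooterReader.readFooter(hadoopConf, file).getFileMetaData.getSchema

println(fromSparkSchema)
```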
The SchemaConverter seems like it could be handled in DF's SchemaAdapter. I'll look at clipParquetSchema as well, thanks!
If you just need Arrow types, can you just convert Spark types to Arrow types? For example, if the column in Spark is treated as timestamp type, its Arrow type is timestamp too.
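A hedged sketch of the direct Spark-to-Arrow mapping suggested here, covering only a handful of types; this is hand-rolled for illustration and is not Comet's (or Spark's) actual converter, and it assumes a recent Arrow Java version:

```scala
import org.apache.arrow.vector.types.{FloatingPointPrecision, TimeUnit}
import org.apache.arrow.vector.types.pojo.ArrowType
import org.apache.spark.sql.types._

// Illustrative Spark -> Arrow type mapping (a few cases only).
def toArrowType(dt: DataType, timeZoneId: String): ArrowType = dt match {
  case IntegerType => new ArrowType.Int(32, true)
  case LongType => new ArrowType.Int(64, true)
  case DoubleType => new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
  case StringType => ArrowType.Utf8.INSTANCE
  case TimestampType => new ArrowType.Timestamp(TimeUnit.MICROSECOND, timeZoneId)
  case d: DecimalType => new ArrowType.Decimal(d.precision, d.scale, 128)
  case other => throw new UnsupportedOperationException(s"Unsupported type: $other")
}

// e.g. toArrowType(TimestampType, "UTC") yields an Arrow timestamp, matching the
// observation that a Spark timestamp column maps to an Arrow timestamp.
```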
My concern is that...
...may yield different results than:
I guess I could exhaustively test this hypothesis with all types.
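Since the two snippets being compared were not captured above, the following is only a rough sketch of such an exhaustive check over Spark types; convertViaA and convertViaB are hypothetical stand-ins for the two conversion paths:

```scala
import org.apache.spark.sql.types._

// Spark types to exercise; extend as needed.
val typesToCheck: Seq[DataType] = Seq(
  BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType,
  StringType, BinaryType, DateType, TimestampType,
  DecimalType(10, 2), DecimalType(38, 18))

// Compare the two (elided) conversion paths type by type.
def compareConversions(convertViaA: DataType => Any, convertViaB: DataType => Any): Unit =
  typesToCheck.foreach { dt =>
    val (a, b) = (convertViaA(dt), convertViaB(dt))
    assert(a == b, s"Conversion mismatch for $dt: $a vs $b")
  }
```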