[SPARK-37273][SQL] Support hidden file metadata columns in Spark SQL #34575
Changes from 12 commits
@@ -35,6 +35,7 @@ import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource}
 import org.apache.spark.sql.execution.datasources.v2.PushedDownOperators
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources.{BaseRelation, Filter}
 import org.apache.spark.sql.types.StructType

@@ -194,10 +195,17 @@ case class FileSourceScanExec(
     disableBucketedScan: Boolean = false)
   extends DataSourceScanExec {

+  lazy val metadataStructCol: Option[AttributeReference] =
+    output.collectFirst { case MetadataAttribute(attr) => attr }
+
   // Note that some vals referring the file-based relation are lazy intentionally
   // so that this plan can be canonicalized on executor side too. See SPARK-23731.
   override lazy val supportsColumnar: Boolean = {
-    relation.fileFormat.supportBatch(relation.sparkSession, schema)
+    // schema without the file metadata struct column
+    val fileSchema = if (metadataStructCol.isEmpty) schema else {
+      output.filter(_.exprId != metadataStructCol.get.exprId).toStructType
+    }
+    relation.fileFormat.supportBatch(relation.sparkSession, fileSchema)
   }

   private lazy val needsUnsafeRowConversion: Boolean = {

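The `MetadataAttribute` extractor used above is defined elsewhere in this PR and is not visible in this hunk. As a rough, self-contained sketch of the idea only (a hypothetical `FileMetadataAttribute` helper and a made-up flag key, not the PR's actual implementation), a metadata column can be modeled as an `AttributeReference` tagged via its column metadata, and the file schema is then the scan output minus any such attribute:

```scala
// Sketch only: runs in a spark-shell; FileMetadataAttribute and METADATA_COL_FLAG
// are hypothetical stand-ins for the PR's MetadataAttribute extractor.
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types._

val METADATA_COL_FLAG = "__file_metadata_col"  // made-up marker key

object FileMetadataAttribute {
  def unapply(attr: AttributeReference): Option[AttributeReference] =
    if (attr.metadata.contains(METADATA_COL_FLAG)) Some(attr) else None
}

val value = AttributeReference("value", StringType)()
val metadataCol = AttributeReference(
  "_metadata", new StructType().add("file_path", StringType),
  metadata = new MetadataBuilder().putBoolean(METADATA_COL_FLAG, true).build())()

val output = Seq(value, metadataCol)

// Mirrors metadataStructCol above: pick out the (first) metadata attribute.
val metadataStructCol = output.collectFirst { case FileMetadataAttribute(a) => a }

// Mirrors fileSchema above: the schema handed to the file format excludes it.
val fileSchema = StructType(
  output.filterNot(a => metadataStructCol.exists(_.exprId == a.exprId))
    .map(a => StructField(a.name, a.dataType, a.nullable)))
// fileSchema contains only StructField(value, StringType, true)
```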
@@ -212,7 +220,12 @@ case class FileSourceScanExec(
     relation.fileFormat.vectorTypes(
       requiredSchema = requiredSchema,
       partitionSchema = relation.partitionSchema,
-      relation.sparkSession.sessionState.conf)
+      relation.sparkSession.sessionState.conf).map { vectorTypes =>
+        // for column-based file format, append metadata struct column's vector type classes if any
+        vectorTypes ++ (if (metadataStructCol.isDefined) {
+          Seq(classOf[OnHeapColumnVector].getName)
+        } else Seq.empty)
+      }

   private lazy val driverMetrics: HashMap[String, Long] = HashMap.empty

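A note on why `OnHeapColumnVector` is the appended vector type: the metadata fields are constant for all rows read from a given file, so they can be materialized into a writable in-memory vector that is tacked onto each columnar batch. A minimal sketch of filling such a vector for a long-valued field like `file_size` (an illustration, not the PR's actual batch-population code; the 1024 value is made up):

```scala
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types.LongType

// One extra vector per metadata field; every row from the same file gets the same value.
val numRows = 4
val fileSizeVector = new OnHeapColumnVector(numRows, LongType)
(0 until numRows).foreach(i => fileSizeVector.putLong(i, 1024L))
// This vector would be appended after the data and partition vectors of the batch.
```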
@@ -355,7 +368,15 @@ case class FileSourceScanExec(
   @transient
   private lazy val pushedDownFilters = {
     val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation)
-    dataFilters.flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown))
+    // TODO: should be able to push filters containing metadata struct down to skip files
+    dataFilters
+      .filterNot(
+        _.references.exists {
+          case MetadataAttribute(_) => true
+          case _ => false
+        }
+      )
+      .flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown))

Suggested change:

-    dataFilters
-      .filterNot(
-        _.references.exists {
-          case MetadataAttribute(_) => true
-          case _ => false
-        }
-      )
-      .flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown))
+    dataFilters.filterNot(_.references.exists {
+      case MetadataAttribute(_) => true
+      case _ => false
+    }).flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown))
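To illustrate what the new `filterNot` achieves, here is a self-contained sketch (a hypothetical flag key standing in for the real `MetadataAttribute` check): predicates whose references include the metadata column are kept out of the set passed to `translateFilter`, so they never reach the data source.

```scala
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, Literal}
import org.apache.spark.sql.types.{MetadataBuilder, StringType}

val FLAG = "__file_metadata_col"  // made-up marker key
val fileName = AttributeReference("file_name", StringType,
  metadata = new MetadataBuilder().putBoolean(FLAG, true).build())()
val value = AttributeReference("value", StringType)()

val dataFilters: Seq[Expression] = Seq(
  EqualTo(value, Literal("a")),             // ordinary data filter, still eligible for pushdown
  EqualTo(fileName, Literal("f0.parquet"))  // references the metadata attribute, filtered out
)

val pushable = dataFilters.filterNot(_.references.exists(_.metadata.contains(FLAG)))
// pushable contains only EqualTo(value, Literal("a"))
```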
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources.Filter
-import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.types.{DataType, LongType, StringType, StructField, StructType}

 /**

@@ -171,6 +171,29 @@ trait FileFormat {
   def supportFieldName(name: String): Boolean = true
 }

+object FileFormat {
+
+  val FILE_PATH = "file_path"
+
+  val FILE_NAME = "file_name"
(Inline review thread)

Contributor: Wondering, do we also plan to deprecate the existing expression?

Contributor: Good point. I think we should, as …
+
+  val FILE_SIZE = "file_size"
+
+  val FILE_MODIFICATION_TIME = "file_modification_time"
+
+  val METADATA_NAME = "_metadata"
+
+  // supported metadata struct fields for hadoop fs relation
+  val METADATA_STRUCT: StructType = new StructType()
+    .add(StructField(FILE_PATH, StringType))
+    .add(StructField(FILE_NAME, StringType))
+    .add(StructField(FILE_SIZE, LongType))
+    .add(StructField(FILE_MODIFICATION_TIME, LongType))
+
+  // create a file metadata struct col
+  def createFileMetadataCol: AttributeReference = MetadataAttribute(METADATA_NAME, METADATA_STRUCT)
+}

 /**
  * The base class file format that is based on text file.
  */

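Taken together with the `FileSourceScanExec` changes above, this companion object defines the shape of the hidden `_metadata` struct. Roughly, the intended end-user behavior (a sketch assuming a running `spark` session and an existing Parquet dataset at a placeholder path) is that the column is absent from the normal schema but its fields can be selected explicitly:

```scala
// "/tmp/users" is a placeholder path for any existing Parquet dataset.
val df = spark.read.parquet("/tmp/users")

// _metadata is hidden: it is not part of df.schema and is not returned by SELECT *,
// but its fields can be requested explicitly.
df.select("*", "_metadata.file_path", "_metadata.file_name",
    "_metadata.file_size", "_metadata.file_modification_time")
  .show(truncate = false)
```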
Review question: Are we assuming that only one column in `output` corresponds to `MetadataAttribute(_)`? Is there some place in the code where we are enforcing this?
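On the Scala semantics behind this question: `collectFirst` only surfaces the first match, so the hunk above would silently use the first metadata attribute and ignore any duplicate; a uniqueness guarantee, if any, would have to come from wherever the column is added to `output`.

```scala
// collectFirst stops at the first matching element rather than failing on duplicates.
val xs = Seq(1, 2, 3, 4)
val firstEven = xs.collectFirst { case x if x % 2 == 0 => x }  // Some(2), not Some(4)
```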