[SPARK-37273][SQL] Support hidden file metadata columns in Spark SQL #34575
Changes from all commits
dee06f6
06ac79e
fc043fd
170378b
73593c5
c531300
bd28eb7
e872d1f
60bdbc5
f78fe92
2baccdb
8b8b9fa
d984b50
0f6eccd
f780bf2
a0a538c
3516e4e
00bda90
afa0a83
65e79ab
3b3d635
4400f6a
@@ -21,13 +21,20 @@ import java.io.{Closeable, FileNotFoundException, IOException}

import scala.util.control.NonFatal

import org.apache.hadoop.fs.Path

import org.apache.spark.{Partition => RDDPartition, SparkUpgradeException, TaskContext}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.{InputFileBlockHolder, RDD}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericInternalRow, JoinedRow, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.execution.datasources.FileFormat._
import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector}
import org.apache.spark.sql.types.{LongType, StringType, StructType}
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.NextIterator

/**
@@ -38,14 +45,17 @@ import org.apache.spark.util.NextIterator
 * @param filePath URI of the file to read
 * @param start the beginning offset (in bytes) of the block.
 * @param length number of bytes to read.
 * @param locations locality information (list of nodes that have the data).
 * @param modificationTime The modification time of the input file, in milliseconds.
 * @param fileSize The length of the input file (not the block), in bytes.
 */
case class PartitionedFile(
    partitionValues: InternalRow,
    filePath: String,
    start: Long,
    length: Long,
    @transient locations: Array[String] = Array.empty) {
    @transient locations: Array[String] = Array.empty,
    modificationTime: Long = 0L,
    fileSize: Long = 0L) {
  override def toString: String = {
    s"path: $filePath, range: $start-${start + length}, partition values: $partitionValues"
  }
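To make the two new fields concrete, here is a hedged sketch (not part of this diff) of how a caller could populate them from a Hadoop FileStatus while splitting files into PartitionedFiles; the helper name is hypothetical.

```scala
// Hedged sketch, not code from this PR: the file listing phase already holds a
// FileStatus, which carries both values, so no extra filesystem call is needed.
import org.apache.hadoop.fs.FileStatus
import org.apache.spark.sql.catalyst.InternalRow

def toPartitionedFile(        // hypothetical helper, for illustration only
    status: FileStatus,
    partitionValues: InternalRow,
    start: Long,
    length: Long): PartitionedFile = {
  PartitionedFile(
    partitionValues = partitionValues,
    filePath = status.getPath.toUri.toString,
    start = start,
    length = length,
    modificationTime = status.getModificationTime, // milliseconds, as documented above
    fileSize = status.getLen)                      // length of the whole file in bytes
}
```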
@@ -57,7 +67,9 @@ case class PartitionedFile(
class FileScanRDD(
    @transient private val sparkSession: SparkSession,
    readFunction: (PartitionedFile) => Iterator[InternalRow],
    @transient val filePartitions: Seq[FilePartition])
    @transient val filePartitions: Seq[FilePartition],
    val readDataSchema: StructType,
    val metadataColumns: Seq[AttributeReference] = Seq.empty)
  extends RDD[InternalRow](sparkSession.sparkContext, Nil) {

  private val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles
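To show how the two new constructor parameters would be supplied, a hedged sketch of the call site in the physical scan node; the variable names are placeholders, not taken from this PR.

```scala
// Hedged sketch: constructing the RDD with the two new parameters. `readFunction`,
// `filePartitions`, `readDataSchema` and `metadataColumns` stand for values the
// physical scan node has already resolved.
val rdd = new FileScanRDD(
  sparkSession,
  readFunction,      // (PartitionedFile) => Iterator[InternalRow], built by the FileFormat
  filePartitions,
  readDataSchema,    // schema of the data columns actually read from the files
  metadataColumns)   // AttributeReferences of the requested hidden metadata columns
```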
@@ -103,6 +115,101 @@ class FileScanRDD(
      context.killTaskIfInterrupted()
      (currentIterator != null && currentIterator.hasNext) || nextIterator()
    }

    ///////////////////////////
    // FILE METADATA METHODS //
    ///////////////////////////

    // a metadata internal row, will only be updated when the current file is changed
    val metadataRow: InternalRow = new GenericInternalRow(metadataColumns.length)

    // an unsafe projection to convert a joined internal row to an unsafe row
    private lazy val projection = {
      val joinedExpressions =
        readDataSchema.fields.map(_.dataType) ++ metadataColumns.map(_.dataType)
      UnsafeProjection.create(joinedExpressions)
    }
    /**
     * For each partitioned file, metadata columns for each record in the file are exactly same.
     * Only update metadata row when `currentFile` is changed.
     */
    private def updateMetadataRow(): Unit = {
      if (metadataColumns.nonEmpty && currentFile != null) {
        val path = new Path(currentFile.filePath)
        metadataColumns.zipWithIndex.foreach { case (attr, i) =>
          attr.name match {
            case FILE_PATH => metadataRow.update(i, UTF8String.fromString(path.toString))
            case FILE_NAME => metadataRow.update(i, UTF8String.fromString(path.getName))
            case FILE_SIZE => metadataRow.update(i, currentFile.fileSize)
            case FILE_MODIFICATION_TIME =>
              // the modificationTime from the file is in millisecond,
              // while internally, the TimestampType is stored in microsecond
              metadataRow.update(i, currentFile.modificationTime * 1000L)
          }
        }
      }
    }
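The FILE_PATH, FILE_NAME, FILE_SIZE and FILE_MODIFICATION_TIME names matched above come from the FileFormat._ import. A hedged sketch of what those constants look like; the literals are an assumption based on the metadata column names this PR introduces, not copied from this diff.

```scala
// Hedged sketch of the constants imported via FileFormat._; the values are assumed.
object FileFormatMetadataNames {          // illustrative stand-in, not the real FileFormat object
  val METADATA_NAME = "_metadata"                       // name of the hidden struct column
  val FILE_PATH = "file_path"
  val FILE_NAME = "file_name"
  val FILE_SIZE = "file_size"
  val FILE_MODIFICATION_TIME = "file_modification_time"
}
```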
    /**
     * Create a writable column vector containing all required metadata columns
     */
    private def createMetadataColumnVector(c: ColumnarBatch): Array[WritableColumnVector] = {
      val path = new Path(currentFile.filePath)
      val filePathBytes = path.toString.getBytes
      val fileNameBytes = path.getName.getBytes
      var rowId = 0
      metadataColumns.map(_.name).map {
Contributor: We should already know how to fill the column vector for each metadata column, so the pattern matching can be done outside of execution; it does not need to be done per batch here.

Contributor: Per-batch pattern matching should be fine; it is only a small overhead.

Contributor: Yeah, I agree it's not a huge issue, since it's per batch and not per row. But I also think it's not hard to organize the code in the most efficient way.

Contributor (Author): Thanks for the comments, really appreciate that! But we have to do something per batch, right? Since we cannot be sure of … unless we could have a …
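A hedged sketch of what the reviewers are suggesting, with hypothetical names (not code from this PR): resolve the name-to-column behaviour once, so the hot per-batch path only runs precomputed fillers instead of pattern matching again.

```scala
// Hedged sketch of the suggestion above, not code from this PR; all names are hypothetical.
// The name-to-filler mapping is decided once (per file), and each batch only invokes the
// precomputed functions with its row count.
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.vectorized.{OnHeapColumnVector, WritableColumnVector}
import org.apache.spark.sql.types.{LongType, StringType}

def precomputeFillers(
    names: Seq[String],
    filePath: String,
    fileSize: Long): Seq[Int => WritableColumnVector] = {
  val path = new Path(filePath)
  names.map {
    case "file_path" =>
      val bytes = path.toString.getBytes
      (numRows: Int) => {
        val v = new OnHeapColumnVector(numRows, StringType)
        var i = 0
        while (i < numRows) { v.putByteArray(i, bytes); i += 1 }
        v
      }
    case "file_size" =>
      (numRows: Int) => {
        val v = new OnHeapColumnVector(numRows, LongType)
        v.putLongs(0, numRows, fileSize)
        v
      }
    // file_name and file_modification_time would follow the same two shapes
  }
}

// Per batch, the scan would then only do: fillers.map(fill => fill(batch.numRows()))
```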
        case FILE_PATH =>
          val columnVector = new OnHeapColumnVector(c.numRows(), StringType)
          rowId = 0
          // use a tight-loop for better performance
          while (rowId < c.numRows()) {
            columnVector.putByteArray(rowId, filePathBytes)
            rowId += 1
          }
          columnVector
Comment on lines +164 to +171

Contributor: It looks like for each batch of input rows we need to recreate a new on-heap column vector and write the same constant values for every row (i.e. file path, file name, file size, etc.). Just wondering about the performance penalty when reading a large table; how big a table have we tested? Maybe a simple optimization here is to come up with something like …

Contributor (Author): Thanks for reviewing! Makes sense; also Bart mentioned something about optimizing …
        case FILE_NAME =>
          val columnVector = new OnHeapColumnVector(c.numRows(), StringType)
          rowId = 0
          // use a tight-loop for better performance
          while (rowId < c.numRows()) {
            columnVector.putByteArray(rowId, fileNameBytes)
            rowId += 1
          }
          columnVector
        case FILE_SIZE =>
          val columnVector = new OnHeapColumnVector(c.numRows(), LongType)
          columnVector.putLongs(0, c.numRows(), currentFile.fileSize)
          columnVector
        case FILE_MODIFICATION_TIME =>
          val columnVector = new OnHeapColumnVector(c.numRows(), LongType)
          // the modificationTime from the file is in millisecond,
          // while internally, the TimestampType is stored in microsecond
          columnVector.putLongs(0, c.numRows(), currentFile.modificationTime * 1000L)
          columnVector
      }.toArray
    }
    /**
     * Add metadata columns at the end of nextElement if needed.
     * For different row implementations, use different methods to update and append.
     */
    private def addMetadataColumnsIfNeeded(nextElement: Object): Object = {
      if (metadataColumns.nonEmpty) {
        nextElement match {
          case c: ColumnarBatch =>
            new ColumnarBatch(
              Array.tabulate(c.numCols())(c.column) ++ createMetadataColumnVector(c),
              c.numRows())
          case u: UnsafeRow => projection.apply(new JoinedRow(u, metadataRow))
          case i: InternalRow => new JoinedRow(i, metadataRow)
        }
      } else {
        nextElement
      }
    }
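For the row-based branches above, a hedged, standalone illustration of what JoinedRow plus the lazy projection do: the reader's row and the per-file metadata row are viewed side by side, and for UnsafeRow input the pair is flattened back into a single UnsafeRow. The column types and values here are made up for the example.

```scala
// Hedged illustration (made-up column types): appending a metadata row to a data row,
// the same mechanism addMetadataColumnsIfNeeded relies on.
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, JoinedRow, UnsafeProjection}
import org.apache.spark.sql.types.{DataType, LongType, StringType}
import org.apache.spark.unsafe.types.UTF8String

// data row: (name: String, score: Long); metadata row: (file_name: String, file_size: Long)
val dataRow = new GenericInternalRow(Array[Any](UTF8String.fromString("alice"), 42L))
val metaRow = new GenericInternalRow(Array[Any](UTF8String.fromString("part-0001.parquet"), 1024L))

// JoinedRow just views the two rows side by side, without copying.
val joined = new JoinedRow(dataRow, metaRow)

// The projection (data column types followed by metadata types) flattens the pair into
// one UnsafeRow that downstream operators expecting UnsafeRow can consume.
val proj = UnsafeProjection.create(Array[DataType](StringType, LongType, StringType, LongType))
val unsafe = proj(joined)
assert(unsafe.getLong(3) == 1024L)   // file_size ends up as the last field
```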
    def next(): Object = {
      val nextElement = currentIterator.next()
      // TODO: we should have a better separation of row based and batch based scan, so that we

@@ -118,7 +225,7 @@ class FileScanRDD(
        }
        inputMetrics.incRecordsRead(1)
      }
      nextElement
      addMetadataColumnsIfNeeded(nextElement)
    }
    private def readCurrentFile(): Iterator[InternalRow] = {

@@ -134,6 +241,7 @@ class FileScanRDD(
    private def nextIterator(): Boolean = {
      if (files.hasNext) {
        currentFile = files.next()
        updateMetadataRow()
        logInfo(s"Reading File $currentFile")
        // Sets InputFileBlockHolder for the file block's information
        InputFileBlockHolder.set(currentFile.filePath, currentFile.start, currentFile.length)
@@ -201,6 +309,7 @@ class FileScanRDD(
        }
      } else {
        currentFile = null
        updateMetadataRow()
        InputFileBlockHolder.unset()
        false
      }
Wondering, do we also plan to deprecate the existing expression InputFileName in Spark?

Good point. I think we should, as InputFileName is really fragile and can't be used with a join, for example.
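For context, a hedged usage sketch of what this comparison looks like from the user's side, assuming the hidden columns end up exposed as a `_metadata` struct (file_path, file_name, file_size, file_modification_time) as the PR title suggests; the SparkSession `spark` and the table path are assumptions for the example.

```scala
// Hedged usage sketch; assumes the hidden metadata columns are exposed as `_metadata`.
import org.apache.spark.sql.functions.input_file_name
import spark.implicits._   // assumes an existing SparkSession named `spark`

// New: the metadata columns behave like real (hidden) columns of the scan, so they can
// be selected, filtered on, and carried through joins like any other column.
val withFileInfo = spark.read.parquet("/tmp/events")   // hypothetical table path
  .select($"*", $"_metadata.file_name", $"_metadata.file_modification_time")

// Old: input_file_name() reports whatever file the current task happens to be reading,
// which is why the thread above calls it fragile around joins.
val legacy = spark.read.parquet("/tmp/events").select($"*", input_file_name())
```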