@@ -20,8 +20,6 @@
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
@@ -63,20 +61,10 @@ public static ParquetMetadata readFooter(
.build()
.getMetadataFilter();
}
return readFooter(configuration, file.toPath(), filter);
return readFooter(HadoopInputFile.fromStatus(file.fileStatus(), configuration), filter);
Member: We can remove SKIP_ROW_GROUPS and WITH_ROW_GROUPS now, I think, as they are no longer used.

Member Author: SKIP_ROW_GROUPS is unused now, but WITH_ROW_GROUPS is used by ParquetPartitionReaderFactory.openFileAndReadFooter in the aggregate push-down case; I removed both constants and replaced WITH_ROW_GROUPS with the literal false instead.
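Presumably the updated call site then looks something like the sketch below; the skipRowGroups parameter name and the exact overload are assumptions based on the readFooter(conf, file, ...) call that appears later in this diff, not the verbatim PR code:

  // Hypothetical call site in ParquetPartitionReaderFactory after the constants are gone:
  // "read the footer with row groups" is expressed by passing false for skipRowGroups.
  val footer = ParquetFooterReader.readFooter(hadoopConf, file, /* skipRowGroups = */ false)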

}

public static ParquetMetadata readFooter(Configuration configuration,
Path file, ParquetMetadataConverter.MetadataFilter filter) throws IOException {
return readFooter(HadoopInputFile.fromPath(file, configuration), filter);
}

public static ParquetMetadata readFooter(Configuration configuration,
FileStatus fileStatus, ParquetMetadataConverter.MetadataFilter filter) throws IOException {
return readFooter(HadoopInputFile.fromStatus(fileStatus, configuration), filter);
}

private static ParquetMetadata readFooter(HadoopInputFile inputFile,
public static ParquetMetadata readFooter(HadoopInputFile inputFile,
ParquetMetadataConverter.MetadataFilter filter) throws IOException {
ParquetReadOptions readOptions =
HadoopReadOptions.builder(inputFile.getConfiguration(), inputFile.getPath())
@@ -49,6 +49,7 @@
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.ConfigurationUtil;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;

@@ -89,24 +90,27 @@ public abstract class SpecificParquetRecordReaderBase<T> extends RecordReader<Vo
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
initialize(inputSplit, taskAttemptContext, Option.empty());
initialize(inputSplit, taskAttemptContext, Option.empty(), Option.empty(), Option.empty());
}

public void initialize(
InputSplit inputSplit,
TaskAttemptContext taskAttemptContext,
Option<HadoopInputFile> inputFile,
Option<SeekableInputStream> inputStream,
Option<ParquetMetadata> fileFooter) throws IOException, InterruptedException {
Configuration configuration = taskAttemptContext.getConfiguration();
FileSplit split = (FileSplit) inputSplit;
this.file = split.getPath();
ParquetReadOptions options = HadoopReadOptions
.builder(configuration, file)
.withRange(split.getStart(), split.getStart() + split.getLength())
.build();
ParquetFileReader fileReader;
if (fileFooter.isDefined()) {
fileReader = new ParquetFileReader(configuration, file, fileFooter.get());
Member Author: This constructor internally calls HadoopInputFile.fromPath(file, configuration), which produces an unnecessary GetFileInfo RPC:

  public static HadoopInputFile fromPath(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    return new HadoopInputFile(fs, fs.getFileStatus(path), conf);
  }

if (inputFile.isDefined() && fileFooter.isDefined() && inputStream.isDefined()) {
fileReader = new ParquetFileReader(
inputFile.get(), fileFooter.get(), options, inputStream.get());
} else {
ParquetReadOptions options = HadoopReadOptions
.builder(configuration, file)
.withRange(split.getStart(), split.getStart() + split.getLength())
.build();
fileReader = new ParquetFileReader(
HadoopInputFile.fromPath(file, configuration), options);
}
@@ -24,8 +24,6 @@
import java.util.List;
import java.util.Set;

import org.apache.spark.SparkUnsupportedOperationException;
import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns;
import scala.Option;
import scala.jdk.javaapi.CollectionConverters;

@@ -35,11 +33,15 @@
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

import org.apache.spark.SparkUnsupportedOperationException;
import org.apache.spark.memory.MemoryMode;
import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils;
import org.apache.spark.sql.execution.vectorized.ConstantColumnVector;
@@ -190,9 +192,11 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont
public void initialize(
InputSplit inputSplit,
TaskAttemptContext taskAttemptContext,
Option<HadoopInputFile> inputFile,
Option<SeekableInputStream> inputStream,
Option<ParquetMetadata> fileFooter)
throws IOException, InterruptedException, UnsupportedOperationException {
super.initialize(inputSplit, taskAttemptContext, fileFooter);
super.initialize(inputSplit, taskAttemptContext, inputFile, inputStream, fileFooter);
initializeInternal();
}

@@ -48,8 +48,8 @@ object PartitionedFileUtil {
start: Long,
length: Long): PartitionedFile = {
val hosts = getBlockHosts(getBlockLocations(file.fileStatus), start, length)
PartitionedFile(partitionValues, SparkPath.fromPath(filePath), start, length, hosts,
file.getModificationTime, file.getLen, file.metadata)
PartitionedFile(partitionValues, SparkPath.fromPath(filePath), start, length,
file.fileStatus, hosts, file.getModificationTime, file.getLen, file.metadata)
}

private def getBlockLocations(file: FileStatus): Array[BlockLocation] = file match {
@@ -317,7 +317,8 @@ object FileFormat {
fieldNames: Seq[String],
filePath: SparkPath,
fileSize: Long,
fileModificationTime: Long): InternalRow = {
fileModificationTime: Long,
fileStatus: FileStatus): InternalRow = {
// When scanning files directly from the filesystem, we only support file-constant metadata
// fields whose values can be derived from a file status. In particular, we don't have accurate
// file split information yet, nor do we have a way to provide custom metadata column values.
@@ -330,6 +331,7 @@
filePath = filePath,
start = 0L,
length = fileSize,
fileStatus = fileStatus,
locations = Array.empty,
modificationTime = fileModificationTime,
fileSize = fileSize,
@@ -85,7 +85,7 @@ class FilePruningRunner(filters: Seq[Expression]) {
boundedFilterMetadataStructOpt.forall { boundedFilter =>
val row =
FileFormat.createMetadataInternalRow(partitionValues, requiredMetadataColumnNames.toSeq,
SparkPath.fromFileStatus(f), f.getLen, f.getModificationTime)
SparkPath.fromFileStatus(f), f.getLen, f.getModificationTime, f)
boundedFilter.eval(row)
}
}
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources
import java.io.{Closeable, FileNotFoundException}
import java.net.URI

import org.apache.hadoop.fs.Path
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.hdfs.BlockMissingException
import org.apache.hadoop.security.AccessControlException

@@ -50,6 +50,7 @@ import org.apache.spark.util.NextIterator
* @param filePath URI of the file to read
* @param start the beginning offset (in bytes) of the block.
* @param length number of bytes to read.
* @param fileStatus The FileStatus instance of the file to read.
* @param modificationTime The modification time of the input file, in milliseconds.
* @param fileSize The length of the input file (not the block), in bytes.
* @param otherConstantMetadataColumnValues The values of any additional constant metadata columns.
@@ -59,6 +60,7 @@ case class PartitionedFile(
filePath: SparkPath,
start: Long,
length: Long,
fileStatus: FileStatus,
Contributor: Similarly, due to the addition of fileStatus, the constructor of PartitionedFile can also be further simplified, right?
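One possible reading of that simplification, as a rough sketch only (the class name is hypothetical and this is not the PR's actual definition): the scalar fields duplicated by fileStatus could become derived accessors.

  import org.apache.hadoop.fs.FileStatus

  import org.apache.spark.paths.SparkPath
  import org.apache.spark.sql.catalyst.InternalRow

  // Hypothetical simplified shape: modificationTime and fileSize become accessors
  // derived from fileStatus instead of duplicated constructor parameters.
  case class SimplifiedPartitionedFile(
      partitionValues: InternalRow,
      filePath: SparkPath,
      start: Long,
      length: Long,
      fileStatus: FileStatus,
      @transient locations: Array[String] = Array.empty,
      otherConstantMetadataColumnValues: Map[String, Any] = Map.empty) {
    def modificationTime: Long = fileStatus.getModificationTime
    def fileSize: Long = fileStatus.getLen
  }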

Contributor: In addition, since fileStatus will hold more state and also participate in serialization, will this lead to additional memory overhead and serialization pressure?

Member Author (@pan3793, Sep 9, 2025): The fileStatus should occupy a little more memory, but I haven't hit OOM issues during the rollout of this change to our online cluster.

Contributor: @cloud-fan Are there also risks of breaking internal APIs with modifications similar to those made here and in FileFormat.createMetadataInternalRow?

Contributor: What's the cost of serializing the file status?

Member Author (@pan3793, Sep 15, 2025): @cloud-fan I think path contributes the majority of the size.

public class FileStatus implements Writable, Comparable<Object>,
    Serializable, ObjectInputValidation {
  ...
  private Path path;           // backed by URI
  private long length;
  private Boolean isdir;
  private short block_replication;
  private long blocksize;
  private long modification_time;
  private long access_time;
  private FsPermission permission;
  private String owner;
  private String group;
  private Path symlink;        // likely be NULL
  private Set<AttrFlags> attr; // AttrFlags is enum
  ...
}

public class FsPermission implements Writable, Serializable,
    ObjectInputValidation {
  ...
  private FsAction useraction = null;  // FsAction is enum
  private FsAction groupaction = null;
  private FsAction otheraction = null;
  private Boolean stickyBit = false;
  ...
}

https://github.com/apache/hadoop/blob/branch-3.4.2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileStatus.java

https://github.com/apache/hadoop/blob/branch-3.4.2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/FsPermission.java

Contributor: Is it possible to have a custom serde for it and only send the path string? This reminds me of SerializableConfiguration, as these Hadoop classes are usually not optimized for serialization and transport.

Member Author: That seems not feasible, because FileStatus has many subclasses, e.g. S3AFileStatus, ViewFsFileStatus.
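For reference, a minimal sketch of the SerializableConfiguration-style idea being discussed; the class name and reconstruction logic are hypothetical, and, as noted above, any FileStatus subclass would be downgraded to the base class on the receiving side.

  import java.io.{ObjectInputStream, ObjectOutputStream}

  import org.apache.hadoop.fs.{FileStatus, Path}

  // Hypothetical wrapper that serializes only the fields Spark actually needs and
  // rebuilds a plain FileStatus on the executor. Subclass-specific state
  // (S3AFileStatus, ViewFsFileStatus, ...) is lost, which is the objection above.
  class SerializableFileStatus(@transient var status: FileStatus) extends Serializable {
    private def writeObject(out: ObjectOutputStream): Unit = {
      out.writeUTF(status.getPath.toString)
      out.writeLong(status.getLen)
      out.writeLong(status.getModificationTime)
    }

    private def readObject(in: ObjectInputStream): Unit = {
      val path = new Path(in.readUTF())
      val len = in.readLong()
      val mtime = in.readLong()
      // FileStatus(length, isdir, block_replication, blocksize, modification_time, path)
      status = new FileStatus(len, false, 0, 0, mtime, path)
    }
  }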

Member Author: @cloud-fan The change basically moves the RPC cost from executor => storage service to driver => executors. In my environment (HDFS with RBF), the latter is much cheaper than the former. I don't have a cloud environment, so I can't give numbers for object storage services like S3.

Contributor: Hmm, then this may cause a regression for short queries?

Member: Hmm, not sure how much difference this will make in terms of driver memory usage. Is it easy to make the FileStatus optional in PartitionedFile and control it via a flag?

It seems that in Parquet Java the file status is only used in one case: https://github.com/apache/parquet-java/blob/master/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/util/HadoopInputFile.java#L109-L132

Mostly we just need the file path and length, but this one use case seems critical to avoid a duplicated NameNode call to get the file status again.

Member Author (@pan3793, Sep 18, 2025): @sunchao Thanks for your suggestion. After an offline discussion with @cloud-fan, I understand his concerns about the overhead of FileStatus. Let me summarize the conclusion and my thoughts:

  1. There may be different Hadoop FileSystem implementations; getFileStatus might be cheap or backed by an executor-side cache in some of them, but in our case (HDFS with RBF) it is relatively heavy.
  2. There is an upcoming optimization to replace FileStatusCache with a PathCache (carrying only the necessary metadata) on the driver side to reduce driver memory.
  3. @cloud-fan suggests constructing the FileStatus on the executor side directly (see the sketch below).

So, I'm going to split this PR into two parts:

  1. I will experiment with (3), but I can only do it for HDFS cases (w/ and w/o RBF, w/ and w/o EC).
  2. Move the rest of the executor-side changes into a dedicated PR.
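For illustration, a minimal sketch of option (3), under the assumption that the path, length, and modification time already carried by PartitionedFile are sufficient; the helper name is hypothetical and this is not what the PR currently implements.

  import org.apache.hadoop.fs.FileStatus

  import org.apache.spark.sql.execution.datasources.PartitionedFile

  // Hypothetical executor-side helper: rebuild a bare-bones FileStatus from the
  // metadata PartitionedFile already carries, so the driver does not have to ship
  // a full FileStatus and the executor does not issue another getFileStatus RPC.
  object ExecutorSideFileStatus {
    def fromPartitionedFile(file: PartitionedFile): FileStatus = {
      // FileStatus(length, isdir, block_replication, blocksize, modification_time, path)
      new FileStatus(file.fileSize, false, 0, 0, file.modificationTime, file.toPath)
    }
  }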

@transient locations: Array[String] = Array.empty,
modificationTime: Long = 0L,
fileSize: Long = 0L,
@@ -26,10 +26,13 @@ import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.parquet.HadoopReadOptions
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.filter2.predicate.FilterApi
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS
import org.apache.parquet.hadoop._
import org.apache.parquet.hadoop.util.HadoopInputFile

import org.apache.spark.TaskContext
import org.apache.spark.internal.Logging
@@ -207,15 +210,31 @@ class ParquetFileFormat

val sharedConf = broadcastedHadoopConf.value.value

val fileFooter = if (enableVectorizedReader) {
// When there are vectorized reads, we can avoid reading the footer twice by reading
// all row groups in advance and filter row groups according to filters that require
// push down (no need to read the footer metadata again).
ParquetFooterReader.readFooter(sharedConf, file, ParquetFooterReader.WITH_ROW_GROUPS)
// When there are vectorized reads, we can avoid
// 1. opening the file twice by transferring the SeekableInputStream
// 2. reading the footer twice by reading all row groups in advance and filter row groups
// according to filters that require push down
val metadataFilter = if (enableVectorizedReader) {
HadoopReadOptions.builder(sharedConf, filePath)
.withRange(file.start, file.start + file.length)
.build.getMetadataFilter
} else {
ParquetFooterReader.readFooter(sharedConf, file, ParquetFooterReader.SKIP_ROW_GROUPS)
ParquetMetadataConverter.SKIP_ROW_GROUPS
}

val readOptions = HadoopReadOptions.builder(sharedConf, filePath)
.withMetadataFilter(metadataFilter).build

val inputFile = HadoopInputFile.fromStatus(file.fileStatus, sharedConf)
val inputStream = inputFile.newStream()
Member: Do we need to ensure this is properly closed if something goes wrong in the following code?

Member Author: The risk is low but still possible before ownership of the inputStream is transferred to the vectorizedReader, so I wrapped the code block with a try/finally to ensure the inputStream won't leak.

This causes an indentation change; please view the diff with "Hide whitespace".
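For context, a minimal sketch of that try/finally shape, with a hypothetical helper name and simplified signature rather than the PR's exact code; the ParquetFileReader.open and detachFileInputStream calls are taken from the diff above.

  import org.apache.parquet.ParquetReadOptions
  import org.apache.parquet.hadoop.ParquetFileReader
  import org.apache.parquet.hadoop.metadata.ParquetMetadata
  import org.apache.parquet.hadoop.util.HadoopInputFile
  import org.apache.parquet.io.SeekableInputStream

  // Open the stream once, read the footer, and close the stream only if something
  // fails before ownership is handed over to the vectorized reader.
  def openAndReadFooter(
      inputFile: HadoopInputFile,
      readOptions: ParquetReadOptions): (SeekableInputStream, ParquetMetadata) = {
    val inputStream = inputFile.newStream()
    var ownershipTransferred = false
    try {
      val fileReader = ParquetFileReader.open(inputFile, readOptions, inputStream)
      val footer = fileReader.getFooter
      // Keep the underlying stream open so the vectorized reader can reuse it later.
      fileReader.detachFileInputStream()
      fileReader.close()
      ownershipTransferred = true
      (inputStream, footer)
    } finally {
      if (!ownershipTransferred) inputStream.close()
    }
  }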

val fileReader = ParquetFileReader.open(inputFile, readOptions, inputStream)
val fileFooter = fileReader.getFooter
if (enableVectorizedReader) {
// Keep the file input stream open so it can be reused later
fileReader.detachFileInputStream()
}
fileReader.close()

val footerFileMetaData = fileFooter.getFileMetaData
val datetimeRebaseSpec = DataSourceUtils.datetimeRebaseSpec(
footerFileMetaData.getKeyValueMetaData.get,
@@ -289,7 +308,8 @@
// Instead, we use FileScanRDD's task completion listener to close this iterator.
val iter = new RecordReaderIterator(vectorizedReader)
try {
vectorizedReader.initialize(split, hadoopAttemptContext, Option.apply(fileFooter))
vectorizedReader.initialize(
split, hadoopAttemptContext, Some(inputFile), Some(inputStream), Some(fileFooter))
Contributor: nit: Although fileFooter should not be null, it's still advisable to use Option(fileFooter), just to be safe.

Member Author: Addressed.

Member Author: @LuciferYang On second thought, I went back to using Some.

Generally, if something goes wrong, let's fail immediately rather than letting the illegal state propagate. Here, we should avoid null propagation as much as possible.
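A small illustration of the distinction being made here, in plain Scala with no Spark types involved:

  // Option(x) silently converts a null into None, so a broken state travels on
  // disguised as "no footer"; Some(x) preserves the null and fails fast on first use.
  val footer: AnyRef = null            // stand-in for an unexpectedly null ParquetMetadata
  val hidden   = Option(footer)        // None - the illegal state is swallowed
  val failFast = Some(footer)          // Some(null) - blows up as soon as it is dereferenced
  hidden.foreach(f => f.toString)      // no-op, the bug goes unnoticed
  // failFast.foreach(f => f.toString) // would throw a NullPointerException here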

logDebug(s"Appending $partitionSchema ${file.partitionValues}")
vectorizedReader.initBatch(partitionSchema, file.partitionValues)
if (returningBatch) {
@@ -447,9 +467,9 @@ object ParquetFileFormat extends Logging {
// Skips row group information since we only need the schema.
// ParquetFileReader.readFooter throws RuntimeException, instead of IOException,
// when it can't read the footer.
Some(new Footer(currentFile.getPath(),
Some(new Footer(currentFile.getPath,
ParquetFooterReader.readFooter(
conf, currentFile, SKIP_ROW_GROUPS)))
HadoopInputFile.fromStatus(currentFile, conf), SKIP_ROW_GROUPS)))
} catch { case e: RuntimeException =>
if (ignoreCorruptFiles) {
logWarning(log"Skipped the footer in the corrupted file: ${MDC(PATH, currentFile)}", e)
@@ -21,10 +21,14 @@ import java.time.ZoneId
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.parquet.HadoopReadOptions
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate}
import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetRecordReader}
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop._
import org.apache.parquet.hadoop.metadata.{FileMetaData, ParquetMetadata}
import org.apache.parquet.hadoop.util.HadoopInputFile
import org.apache.parquet.io.SeekableInputStream

import org.apache.spark.TaskContext
import org.apache.spark.broadcast.Broadcast
@@ -86,17 +90,43 @@ case class ParquetPartitionReaderFactory(

private val parquetReaderCallback = new ParquetReaderCallback()

private def getFooter(file: PartitionedFile): ParquetMetadata = {
val conf = broadcastedConf.value.value
if (aggregation.isDefined || enableVectorizedReader) {
// There are two purposes for reading footer with row groups:
// 1. When there are aggregates to push down, we get max/min/count from footer statistics.
// 2. When there are vectorized reads, we can avoid reading the footer twice by reading
// all row groups in advance and filter row groups according to filters that require
// push down (no need to read the footer metadata again).
ParquetFooterReader.readFooter(conf, file, ParquetFooterReader.WITH_ROW_GROUPS)
private def getFooter(file: PartitionedFile):
(Option[HadoopInputFile], Option[SeekableInputStream], ParquetMetadata) = {
val hadoopConf = broadcastedConf.value.value
if (aggregation.isDefined) {
// When there are aggregates to push down, we get max/min/count from footer statistics.
val footer = ParquetFooterReader.readFooter(
hadoopConf, file, ParquetFooterReader.WITH_ROW_GROUPS)
(None, None, footer)
} else {
ParquetFooterReader.readFooter(conf, file, ParquetFooterReader.SKIP_ROW_GROUPS)
// When there are vectorized reads, we can avoid
Member: Can we extract this into a new util method in ParquetFooterReader which perhaps returns the footer and also the input stream? We can then avoid duplicating this code in two places.

Member Author (@pan3793, Sep 11, 2025): Addressed, please check the updated ParquetFooterReader.

BTW, I don't see a special reason to keep this file in Java; since I'm going to use Scala data structures (Tuple, Option) in this class, I converted it to Scala.

Member: The comment said "When there are vectorized reads..." because getFooter had already checked enableVectorizedReader. I think this else block now handles both the non-vectorized and vectorized cases?

Member Author: Yes, the logic is controlled by enableVectorizedReader; the comment only describes the vectorized-reading optimization. If you find the current comment confusing, I can update it to mention the behavior of the non-vectorized reading path too.

// 1. opening the file twice by transferring the SeekableInputStream
// 2. reading the footer twice by reading all row groups in advance and filter row groups
// according to filters that require push down
val metadataFilter = if (enableVectorizedReader) {
HadoopReadOptions.builder(hadoopConf, file.toPath)
.withRange(file.start, file.start + file.length)
.build.getMetadataFilter
} else {
ParquetMetadataConverter.SKIP_ROW_GROUPS
}
val readOptions = HadoopReadOptions.builder(hadoopConf, file.toPath)
.withMetadataFilter(metadataFilter).build

val inputFile = HadoopInputFile.fromStatus(file.fileStatus, hadoopConf)
val inputStream = inputFile.newStream()
val fileReader = ParquetFileReader.open(inputFile, readOptions, inputStream)
val fileFooter = fileReader.getFooter
if (enableVectorizedReader) {
// Keep the file input stream open so it can be reused later
fileReader.detachFileInputStream()
}
fileReader.close()
if (enableVectorizedReader) {
(Some(inputFile), Some(inputStream), fileFooter)
} else {
(None, None, fileFooter)
}
}
}

@@ -130,7 +160,7 @@ case class ParquetPartitionReaderFactory(
new PartitionReader[InternalRow] {
private var hasNext = true
private lazy val row: InternalRow = {
val footer = getFooter(file)
val (_, _, footer) = getFooter(file)

if (footer != null && footer.getBlocks.size > 0) {
ParquetUtils.createAggInternalRowFromFooter(footer, file.urlEncodedPath,
@@ -175,7 +205,7 @@ case class ParquetPartitionReaderFactory(
new PartitionReader[ColumnarBatch] {
private var hasNext = true
private val batch: ColumnarBatch = {
val footer = getFooter(file)
val (_, _, footer) = getFooter(file)
if (footer != null && footer.getBlocks.size > 0) {
val row = ParquetUtils.createAggInternalRowFromFooter(footer, file.urlEncodedPath,
dataSchema, partitionSchema, aggregation.get, readDataSchema, file.partitionValues,
@@ -213,7 +243,7 @@ case class ParquetPartitionReaderFactory(

val filePath = file.toPath
val split = new FileSplit(filePath, file.start, file.length, Array.empty[String])
val fileFooter = getFooter(file)
val (inputFile, inputStream, fileFooter) = getFooter(file)
val footerFileMetaData = fileFooter.getFileMetaData
val datetimeRebaseSpec = getDatetimeRebaseSpec(footerFileMetaData)
// Try to push down filters when filter push-down is enabled.
@@ -274,7 +304,8 @@ case class ParquetPartitionReaderFactory(
) { reader =>
reader match {
case vectorizedReader: VectorizedParquetRecordReader =>
vectorizedReader.initialize(split, hadoopAttemptContext, Option.apply(fileFooter))
vectorizedReader.initialize(
split, hadoopAttemptContext, inputFile, inputStream, Some(fileFooter))
case _ =>
reader.initialize(split, hadoopAttemptContext)
}
@@ -286,10 +286,14 @@ class FileSourceStrategySuite extends QueryTest with SharedSparkSession {

test("Locality support for FileScanRDD") {
val partition = FilePartition(0, Array(
PartitionedFile(InternalRow.empty, sp("fakePath0"), 0, 10, Array("host0", "host1")),
PartitionedFile(InternalRow.empty, sp("fakePath0"), 10, 20, Array("host1", "host2")),
PartitionedFile(InternalRow.empty, sp("fakePath1"), 0, 5, Array("host3")),
PartitionedFile(InternalRow.empty, sp("fakePath2"), 0, 5, Array("host4"))
PartitionedFile(InternalRow.empty, sp("fakePath0"), 0, 10,
new FileStatus(20, false, 3, 0, 0, sp("fakePath0").toPath), Array("host0", "host1")),
PartitionedFile(InternalRow.empty, sp("fakePath0"), 10, 20,
new FileStatus(20, false, 3, 0, 0, sp("fakePath0").toPath), Array("host1", "host2")),
PartitionedFile(InternalRow.empty, sp("fakePath1"), 0, 5,
new FileStatus(5, false, 3, 0, 0, sp("fakePath1").toPath), Array("host3")),
PartitionedFile(InternalRow.empty, sp("fakePath2"), 0, 5,
new FileStatus(5, false, 3, 0, 0, sp("fakePath2").toPath), Array("host4"))
))

val fakeRDD = new FileScanRDD(
@@ -605,8 +609,10 @@ class FileSourceStrategySuite extends QueryTest with SharedSparkSession {
}

test(s"SPARK-44021: Test ${SQLConf.FILES_MAX_PARTITION_NUM.key} works as expected") {
val files =
Range(0, 300000).map(p => PartitionedFile(InternalRow.empty, sp(s"$p"), 0, 50000000))
val files = Range(0, 300000).map { p =>
PartitionedFile(InternalRow.empty, sp(s"$p"), 0, 50000000,
new FileStatus(0, false, 1, 0, 0, sp(s"$p").toPath))
}
val maxPartitionBytes = conf.filesMaxPartitionBytes
val defaultPartitions = FilePartition.getFilePartitions(spark, files, maxPartitionBytes)
assert(defaultPartitions.size === 150000)