@@ -19,19 +19,21 @@ package org.apache.spark.sql.execution.datasources.parquet
 
 import java.io.FileNotFoundException
 import java.net.URI
+import java.util.concurrent.{Callable, TimeUnit}
 import java.util.logging.{Logger => JLogger}
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable
 import scala.util.{Failure, Try}
 
+import com.google.common.cache.{Cache, CacheBuilder, RemovalListener, RemovalNotification}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.FileSplit
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
 import org.apache.parquet.{Log => ApacheParquetLog}
-import org.apache.parquet.filter2.compat.{FilterCompat, RowGroupFilter}
+import org.apache.parquet.filter2.compat.FilterCompat
 import org.apache.parquet.filter2.predicate.FilterApi
 import org.apache.parquet.format.converter.ParquetMetadataConverter
 import org.apache.parquet.hadoop._
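
The RowGroupFilter import is dropped because row-group pruning moves out of this file into the ParquetFileSplitter implementations, while FilterCompat and FilterApi stay: they are what collapses the pushed-down Spark filters into a single Parquet predicate. A minimal sketch of that construction, assuming two hypothetical columns; the removed filterToSplits in the next hunk built the same shape by converting each Spark filter with ParquetFilters.createFilter and reducing with FilterApi.and:

```scala
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.filter2.predicate.FilterApi

object PredicateSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical columns "id" and "ts"; the real code derives one
    // predicate per pushed-down Spark filter and reduces them with and().
    val pred = FilterApi.and(
      FilterApi.gt(FilterApi.intColumn("id"), Int.box(10)),
      FilterApi.ltEq(FilterApi.longColumn("ts"), Long.box(1000L)))
    // FilterCompat.get wraps the predicate in the form Parquet's
    // row-group filtering consumes.
    val rowGroupFilter = FilterCompat.get(pred)
    println(rowGroupFilter)
  }
}
```
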
@@ -278,75 +280,47 @@ class ParquetFileFormat
     true
   }
 
-  override def getSplits(
-      sparkSession: SparkSession,
-      fileIndex: FileIndex,
-      fileStatus: FileStatus,
-      filters: Seq[Filter],
-      schema: StructType,
-      hadoopConf: Configuration): Seq[FileSplit] = {
-    if (filters.isEmpty || !sparkSession.sessionState.conf.parquetPartitionPruningEnabled) {
+  override def buildSplitter(
+      sparkSession: SparkSession,
+      fileIndex: FileIndex,
+      filters: Seq[Filter],
+      schema: StructType,
+      hadoopConf: Configuration): (FileStatus => Seq[FileSplit]) = {
+    val pruningEnabled = sparkSession.sessionState.conf.parquetPartitionPruningEnabled
+    val defaultSplitter = super.buildSplitter(sparkSession, fileIndex, filters, schema, hadoopConf)
+    if (!pruningEnabled || filters.isEmpty) {
       // Return immediately to save FileSystem overhead
-      super.getSplits(sparkSession, fileIndex, fileStatus, filters, schema, hadoopConf)
+      defaultSplitter
     } else {
-      val filePath = fileStatus.getPath
-      val rootOption: Option[Path] = fileIndex.rootPaths
-        .find(root => filePath.toString.startsWith(root.toString))
-      val metadataOption = rootOption.flatMap { root =>
-        cachedMetadata.get(root).orElse(getMetadataForPath(filePath, root, hadoopConf))
-          .map { metadata =>
-            cachedMetadata.put(root, metadata)
-            metadata
-          }
+      val splitters = fileIndex.rootPaths.map { root =>
+        val splits = ParquetFileFormat.fileSplits.get(root,
+          new Callable[ParquetFileSplitter] {
+            override def call(): ParquetFileSplitter =
+              createParquetFileSplits(root, hadoopConf, schema)
+          })
+        root -> splits.buildSplitter(filters)
+      }.toMap
+      val compositeSplitter: (FileStatus => Seq[FileSplit]) = { stat =>
+        val filePath = stat.getPath
+        val rootOption: Option[Path] = fileIndex.rootPaths
+          .find(root => filePath.toString.startsWith(root.toString))
+        val splitterForPath = rootOption.flatMap(splitters.get).getOrElse(defaultSplitter)
+        splitterForPath(stat)
       }
-      // If the metadata exists, filter the splits.
-      // Otherwise, fall back to the default implementation.
-      metadataOption
-        .map(filterToSplits(fileStatus, _, rootOption.get, filters, schema, hadoopConf))
-        .getOrElse(super.getSplits(sparkSession, fileIndex, fileStatus,
-          filters, schema, hadoopConf))
+      compositeSplitter
     }
   }
 
-  private def filterToSplits(
-      fileStatus: FileStatus,
-      metadata: ParquetMetadata,
-      metadataRoot: Path,
-      filters: Seq[Filter],
-      schema: StructType,
-      hadoopConf: Configuration): Seq[FileSplit] = {
-    val metadataBlocks = metadata.getBlocks
-
-    // Ensure that the metadata has an entry for the file.
-    // If it does not, do not filter at this stage.
-    val metadataContainsPath = metadataBlocks.asScala.exists { bmd =>
-      new Path(metadataRoot, bmd.getPath) == fileStatus.getPath
-    }
-    if (!metadataContainsPath) {
-      log.warn(s"Found _metadata file for $metadataRoot, " +
-        s"but no entries for blocks in ${fileStatus.getPath}. Retaining whole file.")
-      return Seq(new FileSplit(fileStatus.getPath, 0, fileStatus.getLen, Array.empty))
-    }
-
-    val parquetSchema = metadata.getFileMetaData.getSchema
-    val filter = FilterCompat.get(filters
-      .flatMap(ParquetFilters.createFilter(schema, _))
-      .reduce(FilterApi.and))
-    val filteredMetadata =
-      RowGroupFilter.filterRowGroups(filter, metadataBlocks, parquetSchema).asScala
-    filteredMetadata.flatMap { bmd =>
-      val bmdPath = new Path(metadataRoot, bmd.getPath)
-      val fsPath = fileStatus.getPath
-      if (bmdPath == fsPath) {
-        Some(new FileSplit(bmdPath, bmd.getStartingPos, bmd.getTotalByteSize, Array.empty))
-      } else {
-        None
-      }
-    }
+  private def createParquetFileSplits(
+      root: Path,
+      hadoopConf: Configuration,
+      schema: StructType): ParquetFileSplitter = {
+    getMetadataForPath(root, hadoopConf)
+      .map(meta => new ParquetMetadataFileSplitter(root, meta.getBlocks.asScala, schema))
+      .getOrElse(ParquetDefaultFileSplitter)
   }
 
   private def getMetadataForPath(
-      filePath: Path,
       rootPath: Path,
       conf: Configuration): Option[ParquetMetadata] = {
     val fs = rootPath.getFileSystem(conf)
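
The new compositeSplitter dispatches each FileStatus to the splitter registered for its root path, and falls back to the default splitter for files under no known root. A self-contained sketch of that dispatch pattern, with plain strings standing in for Path and FileStatus (all names and paths are illustrative):

```scala
object CompositeSplitterSketch {
  def main(args: Array[String]): Unit = {
    // Fallback used when a file matches no registered root.
    val defaultSplitter: String => Seq[String] = file => Seq(s"whole-file:$file")
    // One handler per root, analogous to root -> splits.buildSplitter(filters).
    val splitters: Map[String, String => Seq[String]] = Map(
      "/data/table1" -> (file => Seq(s"pruned:$file")))

    val composite: String => Seq[String] = { file =>
      // Same prefix test the patch uses: find the root this file lives under.
      val rootOption = splitters.keys.find(root => file.startsWith(root))
      val splitterForPath = rootOption.flatMap(splitters.get).getOrElse(defaultSplitter)
      splitterForPath(file)
    }

    println(composite("/data/table1/part-0.parquet")) // List(pruned:...)
    println(composite("/data/other/part-0.parquet"))  // List(whole-file:...)
  }
}
```
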
@@ -523,6 +497,21 @@ class ParquetFileFormat
 }
 
 object ParquetFileFormat extends Logging {
+
+  @transient private val fileSplits: Cache[Path, ParquetFileSplitter] =
+    CacheBuilder.newBuilder()
+      .expireAfterAccess(4, TimeUnit.HOURS)
+      .concurrencyLevel(1)
+      .softValues()
+      .removalListener(new RemovalListener[Path, ParquetFileSplitter] {
+        override def onRemoval(removalNotification:
+            RemovalNotification[Path, ParquetFileSplitter]): Unit = {
+          val path = removalNotification.getKey
+          log.info(s"Removing value for path $path from cache, " +
+            s"cause: ${removalNotification.getCause}")
+        }
+      }).build()
+
   private[parquet] def readSchema(
       footers: Seq[Footer], sparkSession: SparkSession): Option[StructType] = {
 
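
The fileSplits cache memoizes one ParquetFileSplitter per table root: Guava's Cache#get(key, Callable) computes the value at most once per key and serves the cached copy afterwards, softValues() lets the garbage collector reclaim splitters under memory pressure, and the removal listener logs the eviction cause. A self-contained sketch of the same pattern, with String and Vector[Int] as placeholder key/value types:

```scala
import java.util.concurrent.{Callable, TimeUnit}
import com.google.common.cache.{Cache, CacheBuilder, RemovalListener, RemovalNotification}

object SplitterCacheSketch {
  // String/Vector[Int] are placeholders for Path/ParquetFileSplitter.
  private val cache: Cache[String, Vector[Int]] = CacheBuilder.newBuilder()
    .expireAfterAccess(4, TimeUnit.HOURS) // drop entries idle for four hours
    .concurrencyLevel(1)                  // little write contention expected
    .softValues()                         // GC may reclaim values under pressure
    .removalListener(new RemovalListener[String, Vector[Int]] {
      override def onRemoval(n: RemovalNotification[String, Vector[Int]]): Unit =
        println(s"Removed ${n.getKey}, cause: ${n.getCause}")
    })
    .build()

  def main(args: Array[String]): Unit = {
    // The Callable runs only on a cache miss; later gets reuse the value.
    def load(key: String): Vector[Int] =
      cache.get(key, new Callable[Vector[Int]] {
        override def call(): Vector[Int] = { println(s"computing $key"); Vector(1, 2, 3) }
      })
    load("root")
    load("root") // "computing root" prints only once
  }
}
```
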