Commit f9fc3a6

pwoody authored and robert3005 committed

Fix for many blocks per split (apache#82)

1 parent 3ceffc6 commit f9fc3a6

File tree: 2 files changed (+29 −17 lines)
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java (15 additions, 17 deletions)
```diff
@@ -107,18 +107,19 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
       // then we need to apply the predicate push down filter
       footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
       FilterCompat.Filter filter = getFilter(configuration);
-      this.reader = ParquetFileReader.open(configuration, file, footer);
-      List<RowGroupFilter.FilterLevel> filterLevels =
-          ImmutableList.of(RowGroupFilter.FilterLevel.STATISTICS);
-      if (configuration.getBoolean(DICTIONARY_FILTERING_ENABLED, false)) {
-        filterLevels = ImmutableList.of(RowGroupFilter.FilterLevel.STATISTICS,
-            RowGroupFilter.FilterLevel.DICTIONARY);
+      try (ParquetFileReader reader = ParquetFileReader.open(configuration, file, footer)) {
+        List<RowGroupFilter.FilterLevel> filterLevels =
+            ImmutableList.of(RowGroupFilter.FilterLevel.STATISTICS);
+        if (configuration.getBoolean(DICTIONARY_FILTERING_ENABLED, false)) {
+          filterLevels = ImmutableList.of(RowGroupFilter.FilterLevel.STATISTICS,
+              RowGroupFilter.FilterLevel.DICTIONARY);
+        }
+        blocks = filterRowGroups(
+            filterLevels,
+            filter,
+            footer.getBlocks(),
+            reader);
       }
-      blocks = filterRowGroups(
-          filterLevels,
-          filter,
-          footer.getBlocks(),
-          reader);
     } else {
       // otherwise we find the row groups that were selected on the client
       footer = readFooter(configuration, file, NO_FILTER);
@@ -147,22 +148,19 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
           + " out of: " + Arrays.toString(foundRowGroupOffsets)
           + " in range " + split.getStart() + ", " + split.getEnd());
       }
-      this.reader = new ParquetFileReader(configuration, file, footer);
     }
     this.fileSchema = footer.getFileMetaData().getSchema();
     Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
     ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
     ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
       taskAttemptContext.getConfiguration(), toSetMultiMap(fileMetadata), fileSchema));
     this.requestedSchema = readContext.getRequestedSchema();
-    reader.setRequestedSchema(requestedSchema);
+    this.reader = ParquetFileReader.open(configuration, file, new ParquetMetadata(footer.getFileMetaData(), blocks));
+    this.reader.setRequestedSchema(requestedSchema);
     String sparkRequestedSchemaString =
       configuration.get(ParquetReadSupport$.MODULE$.SPARK_ROW_REQUESTED_SCHEMA());
     this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString);
-    for (BlockMetaData block : blocks) {
-      this.totalRowCount += block.getRowCount();
-    }
-
+    this.totalRowCount = this.reader.getRecordCount();
     // For test purpose.
     // If the predefined accumulator exists, the row group number to read will be updated
     // to the accumulator. So we can check if the row groups are filtered or not in test case.
```
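For context, the sketch below (not part of the commit) condenses the pattern the Java change applies: evaluate the row-group filters with a short-lived `ParquetFileReader` that is closed immediately, then reopen the reader used for the actual scan over a footer listing only the surviving blocks, so its record count and reads match the pruned set. The class and method names are hypothetical, the dictionary filter level is enabled unconditionally here for brevity (the patch gates it behind `DICTIONARY_FILTERING_ENABLED`), and only parquet-mr calls already appearing in the diff (`readFooter`/`range`, `getFilter`, `ParquetFileReader.open`, `RowGroupFilter.filterRowGroups`, `new ParquetMetadata(...)`) are assumed.

```java
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

/** Hypothetical helper illustrating the filter-then-reopen pattern used by this patch. */
final class FilteredReaderSketch {

  static ParquetFileReader openFiltered(
      Configuration conf, Path file, long start, long end) throws IOException {
    // Read only the footer metadata for this split's byte range.
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.range(start, end));
    FilterCompat.Filter filter = ParquetInputFormat.getFilter(conf);

    // A short-lived reader evaluates the row-group filters (statistics and, in this
    // simplified sketch, dictionary pages unconditionally) and is closed right away.
    List<BlockMetaData> blocks;
    try (ParquetFileReader filteringReader = ParquetFileReader.open(conf, file, footer)) {
      blocks = RowGroupFilter.filterRowGroups(
          Arrays.asList(
              RowGroupFilter.FilterLevel.STATISTICS,
              RowGroupFilter.FilterLevel.DICTIONARY),
          filter,
          footer.getBlocks(),
          filteringReader);
    }

    // Reopen the reader over a footer that lists only the surviving blocks, so reads
    // and getRecordCount() see exactly the pruned set of row groups.
    return ParquetFileReader.open(
        conf, file, new ParquetMetadata(footer.getFileMetaData(), blocks));
  }
}
```

Reopening over the filtered `ParquetMetadata` keeps the reader's internal row-group list consistent with `blocks`, which is why the second hunk can replace the manual `BlockMetaData.getRowCount()` sum with `reader.getRecordCount()`, and it is the behavior the new ParquetIOSuite test below exercises with many tiny row groups in a single split.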

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala (14 additions, 0 deletions)
```diff
@@ -743,6 +743,20 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
       assert(option.compressionCodecClassName == "UNCOMPRESSED")
     }
   }
+
+  test("Pruned blocks within a file split do not get used") {
+    withSQLConf(ParquetOutputFormat.BLOCK_SIZE -> "1",
+      SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
+      withTempPath { f =>
+        // Create many blocks that will fit within the maxSplitSize.
+        // Ensure that non-contiguous blocks properly get removed within the vectorized reader.
+        val data = sparkContext.parallelize((1 to 100) ++ (100 to 200) ++ (1 to 100), 1).toDF()
+        data.write.parquet(f.getCanonicalPath)
+        val df = spark.read.parquet(f.getCanonicalPath)
+        assert(df.filter("value <=> 1").count() == 2)
+      }
+    }
+  }
 }
 
 class JobCommitFailureParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
```
