apache · sunchao · Jun 15, 2021 · Jun 16, 2021 · Jun 17, 2021 · Jun 17, 2021
diff --git a/...re/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetReadState.java b/...re/src/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetReadState.java
@@ -17,13 +17,38 @@
 
 package org.apache.spark.sql.execution.datasources.parquet;
 
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.PrimitiveIterator;
+
 /**
  * Helper class to store intermediate state while reading a Parquet column chunk.
  */
 final class ParquetReadState {
-  /** Maximum definition level */
+  /** A special row range used when there is no row indexes (hence all rows must be included) */
+  private static final RowRange MAX_ROW_RANGE = new RowRange(Long.MIN_VALUE, Long.MAX_VALUE);
+
+  /**
+   * A special row range used when the row indexes are present AND all the row ranges have been
+   * processed. This serves as a sentinel at the end indicating that all rows come after the last
+   * row range should be skipped.
+   */
+  private static final RowRange END_ROW_RANGE = new RowRange(Long.MAX_VALUE, Long.MIN_VALUE);
+
+  /** Iterator over all row ranges, only not-null if column index is present */
+  private final Iterator<RowRange> rowRanges;
+
+  /** The current row range */
+  private RowRange currentRange;
+
+  /** Maximum definition level for the Parquet column */
   final int maxDefinitionLevel;
 
+  /** The current index over all rows within the column chunk. This is used to check if the
+   * current row should be skipped by comparing against the row ranges. */
+  long rowId;
+
   /** The offset in the current batch to put the next value */
   int offset;
 
@@ -33,31 +58,108 @@ final class ParquetReadState {
   /** The remaining number of values to read in the current batch */
   int valuesToReadInBatch;
 
-  ParquetReadState(int maxDefinitionLevel) {
+  ParquetReadState(int maxDefinitionLevel, PrimitiveIterator.OfLong rowIndexes) {
     this.maxDefinitionLevel = maxDefinitionLevel;
+    this.rowRanges = constructRanges(rowIndexes);
+    nextRange();
   }
 
   /**
-   * Called at the beginning of reading a new batch.
+   * Construct a list of row ranges from the given `rowIndexes`. For example, suppose the
+   * `rowIndexes` are `[0, 1, 2, 4, 5, 7, 8, 9]`, it will be converted into 3 row ranges:
+   * `[0-2], [4-5], [7-9]`.
    */
-  void resetForBatch(int batchSize) {
+  private Iterator<RowRange> constructRanges(PrimitiveIterator.OfLong rowIndexes) {
+    if (rowIndexes == null) {
+      return null;
+    }
+
+    List<RowRange> rowRanges = new ArrayList<>();
+    long currentStart = Long.MIN_VALUE;
+    long previous = Long.MIN_VALUE;
+
+    while (rowIndexes.hasNext()) {
+      long idx = rowIndexes.nextLong();
+      if (currentStart == Long.MIN_VALUE) {
+        currentStart = idx;
+      } else if (previous + 1 != idx) {
+        RowRange range = new RowRange(currentStart, previous);
+        rowRanges.add(range);
+        currentStart = idx;
+      }
+      previous = idx;
+    }
+
+    if (previous != Long.MIN_VALUE) {
+      rowRanges.add(new RowRange(currentStart, previous));
+    }
+
+    return rowRanges.iterator();
+  }
+
+  /**
+   * Must be called at the beginning of reading a new batch.
+   */
+  void resetForNewBatch(int batchSize) {
     this.offset = 0;
     this.valuesToReadInBatch = batchSize;
   }
 
   /**
-   * Called at the beginning of reading a new page.
+   * Must be called at the beginning of reading a new page.
    */
-  void resetForPage(int totalValuesInPage) {
+  void resetForNewPage(int totalValuesInPage, long pageFirstRowIndex) {
     this.valuesToReadInPage = totalValuesInPage;
+    this.rowId = pageFirstRowIndex;
   }
 
   /**
-   * Advance the current offset to the new values.
+   * Returns the start index of the current row range.
    */
-  void advanceOffset(int newOffset) {
+  long currentRangeStart() {
+    return currentRange.start;
+  }
+
+  /**
+   * Returns the end index of the current row range.
+   */
+  long currentRangeEnd() {
+    return currentRange.end;
+  }
+
+  /**
+   * Advance the current offset and rowId to the new values.
+   */
+  void advanceOffsetAndRowId(int newOffset, long newRowId) {
     valuesToReadInBatch -= (newOffset - offset);
-    valuesToReadInPage -= (newOffset - offset);
+    valuesToReadInPage -= (newRowId - rowId);
     offset = newOffset;
+    rowId = newRowId;
+  }
+
+  /**
+   * Advance to the next range.
+   */
+  void nextRange() {
+    if (rowRanges == null) {
+      currentRange = MAX_ROW_RANGE;
+    } else if (!rowRanges.hasNext()) {
+      currentRange = END_ROW_RANGE;
+    } else {
+      currentRange = rowRanges.next();
+    }
+  }
+
+  /**
+   * Helper struct to represent a range of row indexes `[start, end]`.
+   */
+  private static class RowRange {
+    final long start;
+    final long end;
+
+    RowRange(long start, long end) {
+      this.start = start;
+      this.end = end;
+    }
   }
 }
diff --git a/...rc/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetVectorUpdater.java b/...rc/main/java/org/apache/spark/sql/execution/datasources/parquet/ParquetVectorUpdater.java
@@ -30,20 +30,28 @@ public interface ParquetVectorUpdater {
    * @param values destination values vector
    * @param valuesReader reader to read values from
    */
-  void updateBatch(
+  void readValues(
       int total,
       int offset,
       WritableColumnVector values,
       VectorizedValuesReader valuesReader);
 
+  /**
+   * Skip a batch of `total` values from `valuesReader`.
+   *
+   * @param total total number of values to skip
+   * @param valuesReader reader to skip values from
+   */
+  void skipValues(int total, VectorizedValuesReader valuesReader);
+
   /**
    * Read a single value from `valuesReader` into `values`, at `offset`.
    *
    * @param offset offset in `values` to put the new value
    * @param values destination value vector
    * @param valuesReader reader to read values from
    */
-  void update(int offset, WritableColumnVector values, VectorizedValuesReader valuesReader);
+  void readValue(int offset, WritableColumnVector values, VectorizedValuesReader valuesReader);
 
   /**
    * Process a batch of `total` values starting from `offset` in `values`, whose null slots