orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java (5 additions & 1 deletion)

@@ -97,9 +97,13 @@ public CloseableIterator<T> iterator() {
fileSchemaWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, nameMapping);
}
readOrcSchema = ORCSchemaUtil.buildOrcProjection(schema, fileSchemaWithIds);
// If the projected ORC schema is an empty struct, we are only projecting columns
// with default values that don't exist in previously written files, so there is no
// need to push filters down to ORC's SearchArgument: nothing is read from the file.
boolean isEmptyStruct = readOrcSchema.getChildren().size() == 0;

SearchArgument sarg = null;
if (filter != null) {
if (filter != null && !isEmptyStruct) {

Reviewer: can we add a UT validating this?

Member Author: Sure, just added.

Expression boundFilter = Binder.bind(schema.asStruct(), filter, caseSensitive);
sarg = ExpressionToSearchArgument.convert(boundFilter, readOrcSchema);
}
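
For context, here is a minimal standalone sketch (not part of the patch; the class name is hypothetical) of the condition the new guard checks: an ORC struct type with no children models a projection that reads nothing from the file, which is what buildOrcProjection yields when every projected column is a default-value column absent from the file.

import org.apache.orc.TypeDescription;

public class EmptyProjectionSketch {
  public static void main(String[] args) {
    // ORC initializes the child list for compound types, so a freshly created
    // struct has an empty (non-null) child list.
    TypeDescription emptyProjection = TypeDescription.createStruct();

    // Mirrors the guard above: no children means there are no file columns to
    // evaluate a SearchArgument against, so filter pushdown is skipped.
    System.out.println(emptyProjection.getChildren().isEmpty()); // true
  }
}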
@@ -27,6 +27,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
@@ -151,6 +152,75 @@ public void testOrcDefaultValues() throws IOException {
}
}

@Test
public void testSelectionAndFilterWithDefaultValueColumnOnly() throws IOException {
final int numRows = 10;

final InternalRow expectedFirstRow = new GenericInternalRow(1);
expectedFirstRow.update(0, UTF8String.fromString("foo"));

TypeDescription orcSchema =
TypeDescription.fromString("struct<col1:int>");

Schema readSchema = new Schema(
Types.NestedField.required(2, "col2", Types.StringType.get(), "foo", null)
);

Configuration conf = new Configuration();

File orcFile = temp.newFile();
Path orcFilePath = new Path(orcFile.getPath());

Writer writer = OrcFile.createWriter(orcFilePath,
OrcFile.writerOptions(conf).setSchema(orcSchema).overwrite(true));

VectorizedRowBatch batch = orcSchema.createRowBatch();
LongColumnVector firstCol = (LongColumnVector) batch.cols[0];
for (int r = 0; r < numRows; ++r) {
int row = batch.size++;
firstCol.vector[row] = r;
// If the batch is full, write it out and start over.
if (batch.size == batch.getMaxSize()) {
writer.addRowBatch(batch);
batch.reset();
}
}
if (batch.size != 0) {
writer.addRowBatch(batch);
batch.reset();
}
writer.close();

// Read the data using readSchema, an evolved schema that contains a new
// column with a default value.

// non-vectorized read
try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(orcFile))
.project(readSchema)
.filter(Expressions.equal("col2", "foo"))
.createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema))
.build()) {
final Iterator<InternalRow> actualRows = reader.iterator();
final InternalRow actualFirstRow = actualRows.next();

assertEquals(readSchema, expectedFirstRow, actualFirstRow);
}

// vectorized read
try (CloseableIterable<ColumnarBatch> reader = ORC.read(Files.localInput(orcFile))
.project(readSchema)
.filter(Expressions.equal("col2", "foo"))
.createBatchedReaderFunc(readOrcSchema ->
VectorizedSparkOrcReaders.buildReader(readSchema, readOrcSchema, ImmutableMap.of()))
.build()) {
final Iterator<InternalRow> actualRows = batchesToRows(reader.iterator());
final InternalRow actualFirstRow = actualRows.next();

assertEquals(readSchema, expectedFirstRow, actualFirstRow);
}
}

private Iterator<InternalRow> batchesToRows(Iterator<ColumnarBatch> batches) {
return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator));
}
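
A possible follow-up assertion, shown only as a hedged sketch and not part of the patch: because col2 defaults to "foo" for every row, the equality filter matches all numRows rows even though no SearchArgument is pushed down, so the total row count could be verified as well.

// Hypothetical extension of the non-vectorized read above, reusing the same
// builder calls from the test; assumes JUnit's org.junit.Assert is imported.
int count = 0;
try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(orcFile))
    .project(readSchema)
    .filter(Expressions.equal("col2", "foo"))
    .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema))
    .build()) {
  for (InternalRow row : reader) {
    count += 1;  // every row carries the default value, so every row matches
  }
}
Assert.assertEquals(numRows, count);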