diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java
index 55737f2b50..5606e8cd58 100644
--- a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java
+++ b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java
@@ -97,9 +97,13 @@ public CloseableIterator<T> iterator() {
       fileSchemaWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, nameMapping);
     }
     readOrcSchema = ORCSchemaUtil.buildOrcProjection(schema, fileSchemaWithIds);
+    // If the projected ORC schema is an empty struct, we are only projecting columns with default
+    // values that do not exist in previously written files. In that case there is no need to push
+    // filters down to ORC's SearchArgument, since nothing is read from the data files at all.
+    boolean isEmptyStruct = readOrcSchema.getChildren().isEmpty();
 
     SearchArgument sarg = null;
-    if (filter != null) {
+    if (filter != null && !isEmptyStruct) {
       Expression boundFilter = Binder.bind(schema.asStruct(), filter, caseSensitive);
       sarg = ExpressionToSearchArgument.convert(boundFilter, readOrcSchema);
     }
diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReaderForFieldsWithDefaultValue.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReaderForFieldsWithDefaultValue.java
index adde388038..6e3ec70f82 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReaderForFieldsWithDefaultValue.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReaderForFieldsWithDefaultValue.java
@@ -27,6 +27,7 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.expressions.Expressions;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.orc.ORC;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
@@ -151,6 +152,75 @@ public void testOrcDefaultValues() throws IOException {
     }
   }
 
+  @Test
+  public void testSelectionAndFilterWithDefaultValueColumnOnly() throws IOException {
+    final int numRows = 10;
+
+    final InternalRow expectedFirstRow = new GenericInternalRow(1);
+    // expectedFirstRow.update(0, 0);
+    expectedFirstRow.update(0, UTF8String.fromString("foo"));
+
+    TypeDescription orcSchema =
+        TypeDescription.fromString("struct<col1:int>");
+
+    Schema readSchema = new Schema(
+        Types.NestedField.required(2, "col2", Types.StringType.get(), "foo", null)
+    );
+
+    Configuration conf = new Configuration();
+
+    File orcFile = temp.newFile();
+    Path orcFilePath = new Path(orcFile.getPath());
+
+    Writer writer = OrcFile.createWriter(orcFilePath,
+        OrcFile.writerOptions(conf).setSchema(orcSchema).overwrite(true));
+
+    VectorizedRowBatch batch = orcSchema.createRowBatch();
+    LongColumnVector firstCol = (LongColumnVector) batch.cols[0];
+    for (int r = 0; r < numRows; ++r) {
+      int row = batch.size++;
+      firstCol.vector[row] = r;
+      // If the batch is full, write it out and start over.
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    if (batch.size != 0) {
+      writer.addRowBatch(batch);
+      batch.reset();
+    }
+    writer.close();
+
+    // Try to read the data using the readSchema, which is an evolved
+    // schema that contains a new column with a default value.
+
+    // non-vectorized read
+    try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(orcFile))
+        .project(readSchema)
+        .filter(Expressions.equal("col2", "foo"))
+        .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema))
+        .build()) {
+      final Iterator<InternalRow> actualRows = reader.iterator();
+      final InternalRow actualFirstRow = actualRows.next();
+
+      assertEquals(readSchema, expectedFirstRow, actualFirstRow);
+    }
+
+    // vectorized read
+    try (CloseableIterable<ColumnarBatch> reader = ORC.read(Files.localInput(orcFile))
+        .project(readSchema)
+        .filter(Expressions.equal("col2", "foo"))
+        .createBatchedReaderFunc(readOrcSchema ->
+            VectorizedSparkOrcReaders.buildReader(readSchema, readOrcSchema, ImmutableMap.of()))
+        .build()) {
+      final Iterator<InternalRow> actualRows = batchesToRows(reader.iterator());
+      final InternalRow actualFirstRow = actualRows.next();
+
+      assertEquals(readSchema, expectedFirstRow, actualFirstRow);
+    }
+  }
+
   private Iterator<InternalRow> batchesToRows(Iterator<ColumnarBatch> batches) {
     return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator));
   }
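
Side note, not part of the patch: a minimal sketch of the condition the new OrcIterable guard tests for. When every projected column is filled from a default value and is absent from the data file, the projected ORC schema is a struct with no children, so there is nothing to convert into a SearchArgument. The stand-in below builds such an empty struct by hand with ORC's TypeDescription API (the class name is only for the sketch); it does not go through buildOrcProjection.

import org.apache.orc.TypeDescription;

public class EmptyProjectionSketch {
  public static void main(String[] args) {
    // Hand-built stand-in for the projection the reader ends up with when only
    // default-value columns are selected: a struct with no child columns.
    TypeDescription emptyProjection = TypeDescription.createStruct();

    // Mirrors the guard added in OrcIterable: with no projected children there is
    // nothing to read from the file, so SearchArgument pushdown can be skipped.
    boolean isEmptyStruct = emptyProjection.getChildren().isEmpty();
    System.out.println(isEmptyStruct); // prints "true"
  }
}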