From 83ae0777463a6050693a976840eb8a244a1967fa Mon Sep 17 00:00:00 2001 From: geruh Date: Mon, 15 Sep 2025 00:16:42 -0700 Subject: [PATCH 1/9] Fix variant type filtering in ParquetMetricsRowGroupFilter --- .../data/TestMetricsRowGroupFilter.java | 96 +++++++++++++++++++ .../parquet/ParquetMetricsRowGroupFilter.java | 6 +- 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 384dcacd10cf..a475fecf004b 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -62,6 +62,7 @@ import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.data.orc.GenericOrcReader; import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.io.CloseableIterable; @@ -78,6 +79,10 @@ import org.apache.iceberg.types.Types.FloatType; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.Variants; import org.apache.orc.OrcFile; import org.apache.orc.Reader; import org.apache.parquet.hadoop.ParquetFileReader; @@ -988,6 +993,97 @@ public void testTransformFilter() { .isTrue(); } + @TestTemplate + public void testVariantFilterNotNull() throws IOException { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + Schema variantSchema = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "variant_field", Types.VariantType.get())); + + File parquetFile = new File(tempDir, "test-variant" + System.nanoTime()); + + OutputFile outFile = Files.localOutput(parquetFile); + try (FileAppender appender = + Parquet.write(outFile) + .schema(variantSchema) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + + for (int i = 0; i < 10; i++) { + GenericRecord record = GenericRecord.create(variantSchema); + record.setField("id", i); + + if (i % 2 == 0) { + VariantMetadata metadata = Variants.metadata("field"); + ShreddedObject obj = Variants.object(metadata); + obj.put("field", Variants.of("value" + i)); + Variant variant = Variant.of(metadata, obj); + record.setField("variant_field", variant); + } + + appender.add(record); + } + } + + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + ParquetMetricsRowGroupFilter rowGroupFilter = + new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); + boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan") + .isTrue(); + } + parquetFile.deleteOnExit(); + } + + @TestTemplate + public void testAllNullsVariantNotNull() throws IOException { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + Schema variantSchema = + new Schema( + 
required(1, "id", IntegerType.get()), + optional(2, "variant_field", Types.VariantType.get())); + + File parquetFile = new File(tempDir, "test-variant-nulls" + System.nanoTime()); + + OutputFile outFile = Files.localOutput(parquetFile); + try (FileAppender appender = + Parquet.write(outFile) + .schema(variantSchema) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + + for (int i = 0; i < 10; i++) { + GenericRecord record = GenericRecord.create(variantSchema); + record.setField("id", i); + record.setField("variant_field", null); + appender.add(record); + } + } + + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + ParquetMetricsRowGroupFilter rowGroupFilter = + new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); + boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") + .isTrue(); + } + parquetFile.deleteOnExit(); + } + private boolean shouldRead(Expression expression) { return shouldRead(expression, true); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index 1ad346d39ab7..40ff9048d173 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -157,8 +157,10 @@ public Boolean notNull(BoundReference ref) { // When filtering nested types notNull() is implicit filter passed even though complex // filters aren't pushed down in Parquet. Leave all nested column type filters to be - // evaluated post scan. - if (schema.findType(id) instanceof Type.NestedType) { + // evaluated post scan. Variant types also need to be evaluated post scan to access + // shredded statistics. 
+ Type type = schema.findType(id); + if (type instanceof Type.NestedType || type.isVariantType()) { return ROWS_MIGHT_MATCH; } From 1d41313c0f99899f2e4adbe8c9b01b97f5537d1a Mon Sep 17 00:00:00 2001 From: geruh Date: Tue, 16 Sep 2025 00:25:39 -0700 Subject: [PATCH 2/9] Address comments and add more tests --- .../data/TestMetricsRowGroupFilter.java | 252 +++++++++++------- .../parquet/ParquetMetricsRowGroupFilter.java | 7 +- .../iceberg/spark/SparkTestHelperBase.java | 7 + .../iceberg/spark/sql/TestFilterPushDown.java | 74 ++++- 4 files changed, 234 insertions(+), 106 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index a475fecf004b..9b26168e4904 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -18,7 +18,6 @@ */ package org.apache.iceberg.data; -import static org.apache.iceberg.avro.AvroSchemaUtil.convert; import static org.apache.iceberg.expressions.Expressions.and; import static org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.greaterThan; @@ -49,8 +48,6 @@ import java.util.Arrays; import java.util.List; import java.util.UUID; -import org.apache.avro.generic.GenericData.Record; -import org.apache.avro.generic.GenericRecordBuilder; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.iceberg.FileFormat; @@ -59,7 +56,6 @@ import org.apache.iceberg.ParameterizedTestExtension; import org.apache.iceberg.Parameters; import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.data.orc.GenericOrcReader; import org.apache.iceberg.data.orc.GenericOrcWriter; import org.apache.iceberg.data.parquet.GenericParquetWriter; @@ -73,6 +69,7 @@ import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.DoubleType; @@ -159,12 +156,16 @@ public static List parameters() { private File orcFile = null; private MessageType parquetSchema = null; private BlockMetaData rowGroupMetadata = null; + private Schema testSchema; + private Schema fileSchema; @Parameter private FileFormat format; @TempDir private File tempDir; @BeforeEach public void createInputFile() throws IOException { + this.testSchema = schemaForFormat(format); + this.fileSchema = fileSchemaForFormat(format); switch (format) { case ORC: createOrcInputFile(); @@ -178,16 +179,42 @@ public void createInputFile() throws IOException { } } + private static Schema schemaForFormat(FileFormat format) { + if (format == FileFormat.PARQUET) { + return new Schema( + Lists.newArrayList( + Iterables.concat( + SCHEMA.columns(), + Lists.newArrayList( + optional(18, "some_nulls_variant", Types.VariantType.get()), + optional(19, "all_nulls_variant", Types.VariantType.get()))))); + } + return SCHEMA; + } + + private static Schema fileSchemaForFormat(FileFormat format) { + if (format == FileFormat.PARQUET) { + return new Schema( + Lists.newArrayList( + Iterables.concat( + FILE_SCHEMA.columns(), + Lists.newArrayList( + optional(18, "_some_nulls_variant", Types.VariantType.get()), + 
optional(19, "_all_nulls_variant", Types.VariantType.get()))))); + } + return FILE_SCHEMA; + } + public void createOrcInputFile() throws IOException { this.orcFile = new File(tempDir, "junit" + System.nanoTime()); OutputFile outFile = Files.localOutput(orcFile); try (FileAppender appender = ORC.write(outFile) - .schema(FILE_SCHEMA) + .schema(fileSchema) .createWriterFunc(GenericOrcWriter::buildWriter) .build()) { - GenericRecord record = GenericRecord.create(FILE_SCHEMA); + GenericRecord record = GenericRecord.create(fileSchema); // create 50 records for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 @@ -227,35 +254,46 @@ public void createOrcInputFile() throws IOException { private void createParquetInputFile() throws IOException { File parquetFile = new File(tempDir, "junit" + System.nanoTime()); - // build struct field schema - org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(UNDERSCORE_STRUCT_FIELD_TYPE); - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = Parquet.write(outFile).schema(FILE_SCHEMA).build()) { - GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); + try (FileAppender appender = + Parquet.write(outFile) + .schema(fileSchema) + .createWriterFunc(GenericParquetWriter::create) + .build()) { // create 50 records for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { - builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 - builder.set( + GenericRecord record = GenericRecord.create(fileSchema); + record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 + record.setField( "_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats // in Parquet - builder.set("_required", "req"); // required, always non-null - builder.set("_all_nulls", null); // never non-null - builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values - builder.set("_no_nulls", ""); // optional, but always non-null - builder.set("_all_nans", Double.NaN); // never non-nan - builder.set("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values - builder.set( + record.setField("_required", "req"); // required, always non-null + record.setField("_all_nulls", null); // never non-null + record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values + record.setField("_no_nulls", ""); // optional, but always non-null + record.setField("_all_nans", Double.NaN); // never non-nan + record.setField("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values + record.setField( "_some_double_nans", (i % 10 == 0) ? 
Double.NaN : 2D); // includes some nan values - builder.set("_no_nans", 3D); // optional, but always non-nan - builder.set("_str", i + "str" + i); + record.setField("_no_nans", 3D); // optional, but always non-nan + record.setField("_str", i + "str" + i); - Record structNotNull = new Record(structSchema); - structNotNull.put("_int_field", INT_MIN_VALUE + i); - builder.set("_struct_not_null", structNotNull); // struct with int + GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE); + structNotNull.setField("_int_field", INT_MIN_VALUE + i); + record.setField("_struct_not_null", structNotNull); // struct with int + + if (i % 2 == 0) { + VariantMetadata metadata = Variants.metadata("data"); + ShreddedObject obj = Variants.object(metadata); + obj.put("data", Variants.of(25 + i)); + record.setField("_some_nulls_variant", Variant.of(metadata, obj)); + } else { + record.setField("_some_nulls_variant", null); + } + record.setField("_all_nulls_variant", null); - appender.add(builder.build()); + appender.add(record); } } @@ -986,7 +1024,8 @@ public void testTransformFilter() { assumeThat(format).isEqualTo(FileFormat.PARQUET); boolean shouldRead = - new ParquetMetricsRowGroupFilter(SCHEMA, equal(truncate("required", 2), "some_value"), true) + new ParquetMetricsRowGroupFilter( + testSchema, equal(truncate("required", 2), "some_value"), true) .shouldRead(parquetSchema, rowGroupMetadata); assertThat(shouldRead) .as("Should read: filter contains non-reference evaluate as True") @@ -994,94 +1033,105 @@ public void testTransformFilter() { } @TestTemplate - public void testVariantFilterNotNull() throws IOException { - assumeThat(format).isEqualTo(FileFormat.PARQUET); + public void testVariantNotNull() { + assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - Schema variantSchema = - new Schema( - required(1, "id", IntegerType.get()), - optional(2, "variant_field", Types.VariantType.get())); + boolean shouldRead = shouldRead(notNull("some_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan") + .isTrue(); - File parquetFile = new File(tempDir, "test-variant" + System.nanoTime()); + shouldRead = shouldRead(notNull("all_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") + .isTrue(); + } - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = - Parquet.write(outFile) - .schema(variantSchema) - .createWriterFunc(GenericParquetWriter::create) - .build()) { + @TestTemplate + public void testVariantEq() { + assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - for (int i = 0; i < 10; i++) { - GenericRecord record = GenericRecord.create(variantSchema); - record.setField("id", i); + assertThatThrownBy(() -> shouldRead(equal("some_nulls_variant", "test"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test"); + } - if (i % 2 == 0) { - VariantMetadata metadata = Variants.metadata("field"); - ShreddedObject obj = Variants.object(metadata); - obj.put("field", Variants.of("value" + i)); - Variant variant = Variant.of(metadata, obj); - record.setField("variant_field", variant); - } + @TestTemplate + public void testVariantIn() { + assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - 
appender.add(record); - } - } + assertThatThrownBy(() -> shouldRead(in("some_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); - InputFile inFile = Files.localInput(parquetFile); - try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { - assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); - BlockMetaData blockMetaData = reader.getRowGroups().get(0); - MessageType fileSchema = reader.getFileMetaData().getSchema(); + assertThatThrownBy(() -> shouldRead(in("all_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); + } - ParquetMetricsRowGroupFilter rowGroupFilter = - new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); - boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan") - .isTrue(); - } - parquetFile.deleteOnExit(); + @TestTemplate + public void testVariantNotIn() { + assumeThat(format).isNotEqualTo(FileFormat.ORC); + + // Variant columns cannot be used in 'notIn' expressions with literals + assertThatThrownBy(() -> shouldRead(notIn("some_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); + + assertThatThrownBy(() -> shouldRead(notIn("all_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); } @TestTemplate - public void testAllNullsVariantNotNull() throws IOException { - assumeThat(format).isEqualTo(FileFormat.PARQUET); + public void testVariantIsNull() { + assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); + + boolean shouldRead = shouldRead(isNull("some_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant isNull filters must be evaluated post scan") + .isTrue(); - Schema variantSchema = - new Schema( - required(1, "id", IntegerType.get()), - optional(2, "variant_field", Types.VariantType.get())); + shouldRead = shouldRead(isNull("all_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant isNull filters must be evaluated post scan even for all nulls") + .isTrue(); + } - File parquetFile = new File(tempDir, "test-variant-nulls" + System.nanoTime()); + @TestTemplate + public void testVariantComparisons() { + assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = - Parquet.write(outFile) - .schema(variantSchema) - .createWriterFunc(GenericParquetWriter::create) - .build()) { + // Variant columns cannot be used in comparison expressions with literals + assertThatThrownBy(() -> shouldRead(lessThan("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); - for (int i = 0; i < 10; i++) { - GenericRecord record = GenericRecord.create(variantSchema); - record.setField("id", i); - record.setField("variant_field", null); - appender.add(record); - } - 
} + assertThatThrownBy(() -> shouldRead(lessThanOrEqual("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); - InputFile inFile = Files.localInput(parquetFile); - try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { - BlockMetaData blockMetaData = reader.getRowGroups().get(0); - MessageType fileSchema = reader.getFileMetaData().getSchema(); + assertThatThrownBy(() -> shouldRead(greaterThan("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); - ParquetMetricsRowGroupFilter rowGroupFilter = - new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); - boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") - .isTrue(); - } - parquetFile.deleteOnExit(); + assertThatThrownBy(() -> shouldRead(greaterThanOrEqual("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); + + assertThatThrownBy(() -> shouldRead(notEqual("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); } private boolean shouldRead(Expression expression) { @@ -1103,8 +1153,8 @@ private boolean shouldRead(Expression expression, boolean caseSensitive) { private boolean shouldReadOrc(Expression expression, boolean caseSensitive) { try (CloseableIterable reader = ORC.read(Files.localInput(orcFile)) - .project(SCHEMA) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .project(testSchema) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(testSchema, fileSchema)) .filter(expression) .caseSensitive(caseSensitive) .build()) { @@ -1119,7 +1169,7 @@ private boolean shouldReadParquet( boolean caseSensitive, MessageType messageType, BlockMetaData blockMetaData) { - return new ParquetMetricsRowGroupFilter(SCHEMA, expression, caseSensitive) + return new ParquetMetricsRowGroupFilter(testSchema, expression, caseSensitive) .shouldRead(messageType, blockMetaData); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java index 40ff9048d173..598e5dd23548 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java @@ -155,10 +155,9 @@ public Boolean notNull(BoundReference ref) { // if the column has no non-null values, the expression cannot match int id = ref.fieldId(); - // When filtering nested types notNull() is implicit filter passed even though complex - // filters aren't pushed down in Parquet. Leave all nested column type filters to be - // evaluated post scan. Variant types also need to be evaluated post scan to access - // shredded statistics. + // When filtering nested types or variant types, notNull() is an implicit filter passed + // even though complex filters aren't pushed down in Parquet. Leave these type filters + // to be evaluated post scan. 
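+    // A nested (struct/map/list) or variant column therefore always falls through to
+    // ROWS_MIGHT_MATCH below; only primitive columns are pruned using null counts.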
Type type = schema.findType(id); if (type instanceof Type.NestedType || type.isVariantType()) { return ROWS_MIGHT_MATCH; diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java index 9fc71125a92e..70286c837cdd 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java @@ -24,6 +24,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; import org.apache.spark.sql.Row; +import org.apache.spark.unsafe.types.VariantVal; public class SparkTestHelperBase { protected static final Object ANY = new Object(); @@ -79,6 +80,12 @@ protected void assertEquals(String context, Object[] expectedRow, Object[] actua } else { assertEquals(newContext, (Object[]) expectedValue, (Object[]) actualValue); } + } else if (expectedValue instanceof VariantVal && actualValue instanceof VariantVal) { + // Spark VariantVal may have fixed-width padding, so we compare their JSON representation + // instead of raw byte arrays. + assertThat((actualValue).toString()) + .as("%s contents should match (VariantVal JSON)", context) + .isEqualTo((expectedValue).toString()); } else if (expectedValue != ANY) { assertThat(actualValue).as("%s contents should match", context).isEqualTo(expectedValue); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java index 9d2ce2b388a2..ee45958ab47f 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java @@ -23,6 +23,8 @@ import static org.assertj.core.api.Assertions.assertThat; import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.sql.Timestamp; import java.time.Instant; import java.util.List; @@ -35,7 +37,12 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.spark.SparkCatalogConfig; import org.apache.iceberg.spark.TestBaseWithCatalog; +import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.Variants; import org.apache.spark.sql.execution.SparkPlan; +import org.apache.spark.unsafe.types.VariantVal; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.TestTemplate; import org.junit.jupiter.api.extension.ExtendWith; @@ -578,6 +585,53 @@ public void testFilterPushdownWithSpecialFloatingPointPartitionValues() { ImmutableList.of(row(4, Double.NEGATIVE_INFINITY))); } + @TestTemplate + public void testVariantExtractFiltering() { + sql( + "CREATE TABLE %s (id BIGINT, data VARIANT) USING iceberg TBLPROPERTIES" + + "('format-version'='3')", + tableName); + configurePlanningMode(planningMode); + + sql( + "INSERT INTO %s VALUES " + + "(1, parse_json('{\"field\": \"foo\", \"num\": 25}')), " + + "(2, parse_json('{\"field\": \"bar\", \"num\": 30}')), " + + "(3, parse_json('{\"field\": \"baz\", \"num\": 35}')), " + + "(4, null)", + tableName); + + withDefaultTimeZone( + "UTC", + () -> { + checkFilters( + "try_variant_get(data, '$.num', 'int') > 30", + "isnotnull(data) AND (try_variant_get(data, $.num, IntegerType, false, Some(UTC)) > 30)", + "data IS NOT 
NULL", + ImmutableList.of(row(3L, toSparkVariantRow("baz", 35)))); + + checkFilters( + "try_variant_get(data, '$.num', 'int') = 30", + "isnotnull(data) AND (try_variant_get(data, $.num, IntegerType, false, Some(UTC)) = 30)", + "data IS NOT NULL", + ImmutableList.of(row(2L, toSparkVariantRow("bar", 30)))); + + checkFilters( + "try_variant_get(data, '$.num', 'int') IN (25, 35)", + "try_variant_get(data, $.num, IntegerType, false, Some(UTC)) IN (25,35)", + "", + ImmutableList.of( + row(1L, toSparkVariantRow("foo", 25)), row(3L, toSparkVariantRow("baz", 35)))); + + checkFilters( + "try_variant_get(data, '$.num', 'int') != 25", + "isnotnull(data) AND NOT (try_variant_get(data, $.num, IntegerType, false, Some(UTC)) = 25)", + "data IS NOT NULL", + ImmutableList.of( + row(2L, toSparkVariantRow("bar", 30)), row(3L, toSparkVariantRow("baz", 35)))); + }); + } + private void checkOnlyIcebergFilters( String predicate, String icebergFilters, List expectedRows) { @@ -600,7 +654,7 @@ private void checkFilters( if (sparkFilter != null) { assertThat(planAsString) .as("Post scan filter should match") - .contains("Filter (" + sparkFilter + ")"); + .containsAnyOf("Filter (" + sparkFilter + ")", "Filter " + sparkFilter); } else { assertThat(planAsString).as("Should be no post scan filter").doesNotContain("Filter ("); } @@ -613,4 +667,22 @@ private void checkFilters( private Timestamp timestamp(String timestampAsString) { return Timestamp.from(Instant.parse(timestampAsString)); } + + private VariantVal toSparkVariantRow(String field, int num) { + VariantMetadata metadata = Variants.metadata("field", "num"); + + ShreddedObject obj = Variants.object(metadata); + obj.put("field", Variants.of(field)); + obj.put("num", Variants.of(num)); + + ByteBuffer metadataBuffer = + ByteBuffer.allocate(metadata.sizeInBytes()).order(ByteOrder.LITTLE_ENDIAN); + metadata.writeTo(metadataBuffer, 0); + + ByteBuffer valueBuffer = ByteBuffer.allocate(obj.sizeInBytes()).order(ByteOrder.LITTLE_ENDIAN); + obj.writeTo(valueBuffer, 0); + + return new VariantVal( + ByteBuffers.toByteArray(valueBuffer), ByteBuffers.toByteArray(metadataBuffer)); + } } From 99dc168683c408d7a3b4150a44c799488d7ae2fb Mon Sep 17 00:00:00 2001 From: geruh Date: Tue, 16 Sep 2025 13:40:45 -0700 Subject: [PATCH 3/9] Refactor tests to be format specific --- .../data/MetricsRowGroupFilterTestBase.java | 619 +++++++++ .../data/TestMetricsRowGroupFilter.java | 1200 ----------------- .../orc/TestOrcMetricsRowGroupFilter.java | 99 ++ .../TestParquetMetricsRowGroupFilter.java | 444 ++++++ 4 files changed, 1162 insertions(+), 1200 deletions(-) create mode 100644 data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java delete mode 100644 data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java create mode 100644 data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java create mode 100644 data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java diff --git a/data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java b/data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java new file mode 100644 index 000000000000..3e8e7f9d66e3 --- /dev/null +++ b/data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java @@ -0,0 +1,619 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data; + +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.io.IOException; +import org.apache.iceberg.Schema; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public abstract class MetricsRowGroupFilterTestBase { + + protected static final Types.StructType STRUCT_FIELD_TYPE = + Types.StructType.of(Types.NestedField.required(7, "int_field", Types.IntegerType.get())); + + protected static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE = + Types.StructType.of(Types.NestedField.required(7, "_int_field", Types.IntegerType.get())); + + protected static final Schema BASE_SCHEMA = + new Schema( + required(1, "id", Types.IntegerType.get()), + required(2, "required", Types.StringType.get()), + optional(3, "all_nulls", Types.DoubleType.get()), + optional(4, "some_nulls", Types.StringType.get()), + optional(5, "no_nulls", Types.StringType.get()), + optional(6, "struct_not_null", STRUCT_FIELD_TYPE), + optional(8, "not_in_file", Types.FloatType.get()), + optional(9, "str", Types.StringType.get()), + optional( + 10, + "map_not_null", + Types.MapType.ofRequired(11, 12, Types.StringType.get(), Types.IntegerType.get())), + optional(13, "all_nans", Types.DoubleType.get()), + optional(14, "some_nans", Types.FloatType.get()), + optional(15, "no_nans", Types.DoubleType.get()), + optional(16, "some_double_nans", Types.DoubleType.get())); 
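+
+  // BASE_FILE_SCHEMA mirrors BASE_SCHEMA with the same field ids but underscore-prefixed
+  // names: data files are written with the underscored names while filters are bound
+  // against BASE_SCHEMA, so column matching must resolve by field id rather than by name.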
+ + protected static final Schema BASE_FILE_SCHEMA = + new Schema( + Types.NestedField.required(1, "_id", Types.IntegerType.get()), + Types.NestedField.required(2, "_required", Types.StringType.get()), + Types.NestedField.optional(3, "_all_nulls", Types.DoubleType.get()), + Types.NestedField.optional(4, "_some_nulls", Types.StringType.get()), + Types.NestedField.optional(5, "_no_nulls", Types.StringType.get()), + Types.NestedField.optional(6, "_struct_not_null", UNDERSCORE_STRUCT_FIELD_TYPE), + Types.NestedField.optional(9, "_str", Types.StringType.get()), + Types.NestedField.optional(13, "_all_nans", Types.DoubleType.get()), + Types.NestedField.optional(14, "_some_nans", Types.FloatType.get()), + Types.NestedField.optional(15, "_no_nans", Types.DoubleType.get()), + Types.NestedField.optional(16, "_some_double_nans", Types.DoubleType.get())); + + protected static final int INT_MIN_VALUE = 30; + protected static final int INT_MAX_VALUE = 79; + + @TempDir protected File tempDir; + + @BeforeEach + public final void before() throws IOException { + createInputFile(); + } + + protected abstract void createInputFile() throws IOException; + + protected void populateBaseFields(GenericRecord record, int rowIndex) { + record.setField("_id", INT_MIN_VALUE + rowIndex); + record.setField("_required", "req"); + record.setField("_all_nulls", null); + record.setField("_some_nulls", (rowIndex % 10 == 0) ? null : "some"); + record.setField("_no_nulls", ""); + record.setField("_all_nans", Double.NaN); + record.setField("_some_nans", (rowIndex % 10 == 0) ? Float.NaN : 2F); + record.setField("_some_double_nans", (rowIndex % 10 == 0) ? Double.NaN : 2D); + record.setField("_no_nans", 3D); + record.setField("_str", rowIndex + "str" + rowIndex); + GenericRecord struct = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE); + struct.setField("_int_field", INT_MIN_VALUE + rowIndex); + record.setField("_struct_not_null", struct); + } + + protected boolean shouldRead(Expression expression) { + return shouldRead(expression, true); + } + + protected abstract boolean shouldRead(Expression expression, boolean caseSensitive); + + @Test + public void testAllNulls() { + boolean shouldRead; + + shouldRead = shouldRead(notNull("all_nulls")); + assertThat(shouldRead).as("Should skip: no non-null value in all null column").isFalse(); + + shouldRead = shouldRead(notNull("some_nulls")); + assertThat(shouldRead) + .as("Should read: column with some nulls contains a non-null value") + .isTrue(); + + shouldRead = shouldRead(notNull("no_nulls")); + assertThat(shouldRead).as("Should read: non-null column contains a non-null value").isTrue(); + + shouldRead = shouldRead(notNull("map_not_null")); + assertThat(shouldRead).as("Should read: map type is not skipped").isTrue(); + + shouldRead = shouldRead(notNull("struct_not_null")); + assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue(); + } + + @Test + public void testNoNulls() { + boolean shouldRead = shouldRead(isNull("all_nulls")); + assertThat(shouldRead).as("Should read: at least one null value in all null column").isTrue(); + + shouldRead = shouldRead(isNull("some_nulls")); + assertThat(shouldRead).as("Should read: column with some nulls contains a null value").isTrue(); + + shouldRead = shouldRead(isNull("no_nulls")); + assertThat(shouldRead).as("Should skip: non-null column contains no null values").isFalse(); + + shouldRead = shouldRead(isNull("map_not_null")); + assertThat(shouldRead).as("Should read: map type is not skipped").isTrue(); + + shouldRead = 
shouldRead(isNull("struct_not_null"));
+    assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue();
+  }
+
+  @Test
+  public void testFloatWithNan() {
+    boolean shouldRead = shouldRead(greaterThan("some_nans", 1.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(greaterThanOrEqual("some_nans", 1.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(lessThan("some_nans", 3.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(lessThanOrEqual("some_nans", 1.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(equal("some_nans", 2.0));
+    assertThat(shouldRead).isTrue();
+  }
+
+  @Test
+  public void testDoubleWithNan() {
+    boolean shouldRead = shouldRead(greaterThan("some_double_nans", 1.0));
+    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
+
+    shouldRead = shouldRead(greaterThanOrEqual("some_double_nans", 1.0));
+    assertThat(shouldRead)
+        .as("Should read: column with some nans contains the target value")
+        .isTrue();
+
+    shouldRead = shouldRead(lessThan("some_double_nans", 3.0));
+    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
+
+    shouldRead = shouldRead(lessThanOrEqual("some_double_nans", 1.0));
+    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
+
+    shouldRead = shouldRead(equal("some_double_nans", 2.0));
+    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
+  }
+
+  @Test
+  public void testIsNaN() {
+    boolean shouldRead = shouldRead(isNaN("all_nans"));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(isNaN("some_nans"));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(isNaN("all_nulls"));
+    assertThat(shouldRead).as("Should skip: all null column will not contain NaN").isFalse();
+  }
+
+  @Test
+  public void testNotNaN() {
+    boolean shouldRead = shouldRead(notNaN("all_nans"));
+    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
+
+    shouldRead = shouldRead(notNaN("some_nans"));
+    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
+
+    shouldRead = shouldRead(notNaN("no_nans"));
+    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
+
+    shouldRead = shouldRead(notNaN("all_nulls"));
+    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
+  }
+
+  @Test
+  public void testRequiredColumn() {
+    boolean shouldRead = shouldRead(notNull("required"));
+    assertThat(shouldRead).as("Should read: required columns are always non-null").isTrue();
+
+    shouldRead = shouldRead(isNull("required"));
+    assertThat(shouldRead).as("Should skip: required columns are always non-null").isFalse();
+  }
+
+  @Test
+  public void testMissingColumn() {
+    assertThatThrownBy(() -> shouldRead(lessThan("missing", 5)))
+        .as("Should complain about missing column in expression")
+        .isInstanceOf(ValidationException.class)
+        .hasMessageStartingWith("Cannot find field 'missing'");
+  }
+
+  @Test
+  public void testNot() {
+    // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out
+    boolean shouldRead = shouldRead(not(lessThan("id", INT_MIN_VALUE - 25)));
+    assertThat(shouldRead).as("Should read: not(false)").isTrue();
+
+    shouldRead = shouldRead(not(greaterThan("id", INT_MIN_VALUE - 25)));
+    assertThat(shouldRead).as("Should skip: not(true)").isFalse();
+  }
+
+  @Test
+  public void testAnd() {
+    // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out
+    boolean shouldRead =
+        shouldRead(
+            and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30)));
+    assertThat(shouldRead).as("Should skip: and(false, true)").isFalse();
+
+    shouldRead =
+        shouldRead(
+            and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1)));
+    assertThat(shouldRead).as("Should skip: and(false, false)").isFalse();
+
+    shouldRead =
+        shouldRead(
+            and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE)));
+    assertThat(shouldRead).as("Should read: and(true, true)").isTrue();
+  }
+
+  @Test
+  public void testOr() {
+    // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out
+    boolean shouldRead =
+        shouldRead(
+            or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1)));
+    assertThat(shouldRead).as("Should skip: or(false, false)").isFalse();
+
+    shouldRead =
+        shouldRead(
+            or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19)));
+    assertThat(shouldRead).as("Should read: or(false, true)").isTrue();
+  }
+
+  @Test
+  public void testIntegerLt() {
+    boolean shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE - 25));
+    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
+
+    shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE));
+    assertThat(shouldRead)
+        .as("Should not read: id range below lower bound (30 is not < 30)")
+        .isFalse();
+
+    shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE + 1));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(lessThan("id", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testIntegerLtEq() {
+    boolean shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 25));
+    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
+
+    shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 1));
+    assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse();
+
+    shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(lessThanOrEqual("id", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testIntegerGt() {
+    boolean shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE + 6));
+    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
+
+    shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE));
+    assertThat(shouldRead)
+        .as("Should not read: id range above upper bound (79 is not > 79)")
+        .isFalse();
+
+    shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 1));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 4));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testIntegerGtEq() {
+    boolean shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 6));
+    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
+
+    shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 1));
+    assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse();
+
+    shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE - 4));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testIntegerEq() {
+    boolean shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 25));
+    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
+
+    shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 1));
+    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
+
+    shouldRead = shouldRead(equal("id", INT_MIN_VALUE));
+    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
+
+    shouldRead = shouldRead(equal("id", INT_MAX_VALUE - 4));
+    assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue();
+
+    shouldRead = shouldRead(equal("id", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue();
+
+    shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 1));
+    assertThat(shouldRead).as("Should not read: id above upper bound").isFalse();
+
+    shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 6));
+    assertThat(shouldRead).as("Should not read: id above upper bound").isFalse();
+  }
+
+  @Test
+  public void testIntegerNotEq() {
+    boolean shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 25));
+    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
+
+    shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 1));
+    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
+
+    shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE));
+    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
+
+    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE - 4));
+    assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue();
+
+    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue();
+
+    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1));
+    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
+
+    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 6));
+    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
+  }
+
+  @Test
+  public void testIntegerNotEqRewritten() {
+    boolean shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 25)));
+    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
+
+    shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 1)));
+    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
+
+    shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE)));
+    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
+
+    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE - 4)));
+    assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue();
+
+    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE)));
+    assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue();
+
+    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 1)));
+    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
+
+    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 6)));
+    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
+  }
+
+  @Test
+  public void testStructFieldLt() {
+    boolean shouldRead =
+        shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE - 25));
+    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
+
+    shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE));
+    assertThat(shouldRead)
+        .as("Should not read: id range below lower bound (30 is not < 30)")
+        .isFalse();
+
+    shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE + 1));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testStructFieldLtEq() {
+    boolean shouldRead =
+        shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25));
+    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
+
+    shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1));
+    assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse();
+
+    shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testStructFieldGt() {
+    boolean shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE + 6));
+    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
+
+    shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE));
+    assertThat(shouldRead)
+        .as("Should not read: id range above upper bound (79 is not > 79)")
+        .isFalse();
+
+    shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 1));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 4));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testStructFieldGtEq() {
+    boolean shouldRead =
+        shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6));
+    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
+
+    shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1));
+    assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse();
+
+    shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE));
+    assertThat(shouldRead).as("Should read: one possible id").isTrue();
+
+    shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE - 4));
+    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
+  }
+
+  @Test
+  public void testStructFieldEq() {
+    boolean shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 25));
+    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
+
+    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 1));
+    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
+
+    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE));
+    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
+
+    shouldRead =
shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); + } + + @Test + public void testStructFieldNotEq() { + boolean shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); + } + + @Test + public void testCaseInsensitive() { + boolean shouldRead = shouldRead(equal("ID", INT_MIN_VALUE - 25), false); + assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); + } + + @Test + public void testIntegerIn() { + boolean shouldRead = shouldRead(in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); + assertThat(shouldRead).as("Should not read: id below lower bound (5 < 30, 6 < 30)").isFalse(); + + shouldRead = shouldRead(in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should not read: id below lower bound (28 < 30, 29 < 30)").isFalse(); + + shouldRead = shouldRead(in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 30)").isTrue(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); + assertThat(shouldRead) + .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") + .isTrue(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); + assertThat(shouldRead).as("Should not read: id above upper bound (80 > 79, 81 > 79)").isFalse(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); + assertThat(shouldRead).as("Should not read: id above upper bound (85 > 79, 86 > 79)").isFalse(); + + shouldRead = shouldRead(in("all_nulls", 1, 2)); + assertThat(shouldRead).as("Should skip: in on all nulls column").isFalse(); + + shouldRead = shouldRead(in("some_nulls", "aaa", "some")); + assertThat(shouldRead).as("Should read: in on some 
nulls column").isTrue(); + + shouldRead = shouldRead(in("no_nulls", "aaa", "")); + assertThat(shouldRead).as("Should read: in on no nulls column").isTrue(); + } + + @Test + public void testIntegerNotIn() { + boolean shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); + assertThat(shouldRead).as("Should read: id below lower bound (5 < 30, 6 < 30)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should read: id below lower bound (28 < 30, 29 < 30)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 30)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); + assertThat(shouldRead) + .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") + .isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); + assertThat(shouldRead).as("Should read: id above upper bound (80 > 79, 81 > 79)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); + assertThat(shouldRead).as("Should read: id above upper bound (85 > 79, 86 > 79)").isTrue(); + + shouldRead = shouldRead(notIn("all_nulls", 1, 2)); + assertThat(shouldRead).as("Should read: notIn on all nulls column").isTrue(); + + shouldRead = shouldRead(notIn("some_nulls", "aaa", "some")); + assertThat(shouldRead).as("Should read: notIn on some nulls column").isTrue(); + } + + @Test + public void testSomeNullsNotEq() { + boolean shouldRead = shouldRead(notEqual("some_nulls", "some")); + assertThat(shouldRead).as("Should read: notEqual on some nulls column").isTrue(); + } +} diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java deleted file mode 100644 index 9b26168e4904..000000000000 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ /dev/null @@ -1,1200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.data; - -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNaN; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.notStartsWith; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; -import static org.apache.iceberg.expressions.Expressions.truncate; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.assertj.core.api.Assumptions.assumeThat; - -import java.io.File; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Arrays; -import java.util.List; -import java.util.UUID; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Files; -import org.apache.iceberg.Parameter; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.orc.GenericOrcWriter; -import org.apache.iceberg.data.parquet.GenericParquetWriter; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.io.SeekableInputStream; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.DoubleType; -import org.apache.iceberg.types.Types.FloatType; -import org.apache.iceberg.types.Types.IntegerType; -import org.apache.iceberg.types.Types.StringType; -import org.apache.iceberg.variants.ShreddedObject; -import org.apache.iceberg.variants.Variant; -import org.apache.iceberg.variants.VariantMetadata; -import org.apache.iceberg.variants.Variants; -import org.apache.orc.OrcFile; -import org.apache.orc.Reader; -import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.io.DelegatingSeekableInputStream; -import 
org.apache.parquet.schema.MessageType; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; -import org.junit.jupiter.api.io.TempDir; - -@ExtendWith(ParameterizedTestExtension.class) -public class TestMetricsRowGroupFilter { - - @Parameters(name = "fileFormat = {0}") - public static List parameters() { - return Arrays.asList(FileFormat.PARQUET, FileFormat.ORC); - } - - private static final Types.StructType STRUCT_FIELD_TYPE = - Types.StructType.of(Types.NestedField.required(8, "int_field", IntegerType.get())); - - private static final Schema SCHEMA = - new Schema( - required(1, "id", IntegerType.get()), - optional(2, "no_stats_parquet", StringType.get()), - required(3, "required", StringType.get()), - optional(4, "all_nulls", DoubleType.get()), - optional(5, "some_nulls", StringType.get()), - optional(6, "no_nulls", StringType.get()), - optional(7, "struct_not_null", STRUCT_FIELD_TYPE), - optional(9, "not_in_file", FloatType.get()), - optional(10, "str", StringType.get()), - optional( - 11, - "map_not_null", - Types.MapType.ofRequired(12, 13, StringType.get(), IntegerType.get())), - optional(14, "all_nans", DoubleType.get()), - optional(15, "some_nans", FloatType.get()), - optional(16, "no_nans", DoubleType.get()), - optional(17, "some_double_nans", DoubleType.get())); - - private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE = - Types.StructType.of(Types.NestedField.required(8, "_int_field", IntegerType.get())); - - private static final Schema FILE_SCHEMA = - new Schema( - required(1, "_id", IntegerType.get()), - optional(2, "_no_stats_parquet", StringType.get()), - required(3, "_required", StringType.get()), - optional(4, "_all_nulls", DoubleType.get()), - optional(5, "_some_nulls", StringType.get()), - optional(6, "_no_nulls", StringType.get()), - optional(7, "_struct_not_null", UNDERSCORE_STRUCT_FIELD_TYPE), - optional(10, "_str", StringType.get()), - optional(14, "_all_nans", Types.DoubleType.get()), - optional(15, "_some_nans", FloatType.get()), - optional(16, "_no_nans", Types.DoubleType.get()), - optional(17, "_some_double_nans", Types.DoubleType.get())); - - private static final String TOO_LONG_FOR_STATS_PARQUET; - - static { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < 200; i += 1) { - sb.append(UUID.randomUUID()); - } - TOO_LONG_FOR_STATS_PARQUET = sb.toString(); - } - - private static final int INT_MIN_VALUE = 30; - private static final int INT_MAX_VALUE = 79; - - private File orcFile = null; - private MessageType parquetSchema = null; - private BlockMetaData rowGroupMetadata = null; - private Schema testSchema; - private Schema fileSchema; - - @Parameter private FileFormat format; - @TempDir private File tempDir; - - @BeforeEach - public void createInputFile() throws IOException { - this.testSchema = schemaForFormat(format); - this.fileSchema = fileSchemaForFormat(format); - switch (format) { - case ORC: - createOrcInputFile(); - break; - case PARQUET: - createParquetInputFile(); - break; - default: - throw new UnsupportedOperationException( - "Row group filter tests not supported for " + format); - } - } - - private static Schema schemaForFormat(FileFormat format) { - if (format == FileFormat.PARQUET) { - return new Schema( - Lists.newArrayList( - Iterables.concat( - SCHEMA.columns(), - Lists.newArrayList( - optional(18, "some_nulls_variant", Types.VariantType.get()), - optional(19, "all_nulls_variant", Types.VariantType.get()))))); - } - return SCHEMA; - } - - 
private static Schema fileSchemaForFormat(FileFormat format) { - if (format == FileFormat.PARQUET) { - return new Schema( - Lists.newArrayList( - Iterables.concat( - FILE_SCHEMA.columns(), - Lists.newArrayList( - optional(18, "_some_nulls_variant", Types.VariantType.get()), - optional(19, "_all_nulls_variant", Types.VariantType.get()))))); - } - return FILE_SCHEMA; - } - - public void createOrcInputFile() throws IOException { - this.orcFile = new File(tempDir, "junit" + System.nanoTime()); - - OutputFile outFile = Files.localOutput(orcFile); - try (FileAppender appender = - ORC.write(outFile) - .schema(fileSchema) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { - GenericRecord record = GenericRecord.create(fileSchema); - // create 50 records - for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { - record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 - record.setField( - "_no_stats_parquet", - TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats - // in Parquet, but will produce stats for ORC - record.setField("_required", "req"); // required, always non-null - record.setField("_all_nulls", null); // never non-null - record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values - record.setField("_no_nulls", ""); // optional, but always non-null - record.setField("_str", i + "str" + i); - record.setField("_all_nans", Double.NaN); // never non-nan - record.setField("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values - record.setField( - "_some_double_nans", (i % 10 == 0) ? Double.NaN : 2D); // includes some nan values - record.setField("_no_nans", 3D); // optional, but always non-nan - - GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE); - structNotNull.setField("_int_field", INT_MIN_VALUE + i); - record.setField("_struct_not_null", structNotNull); // struct with int - - appender.add(record); - } - } - - InputFile inFile = Files.localInput(orcFile); - try (Reader reader = - OrcFile.createReader( - new Path(inFile.location()), OrcFile.readerOptions(new Configuration()))) { - assertThat(reader.getStripes()).as("Should create only one stripe").hasSize(1); - } - - orcFile.deleteOnExit(); - } - - private void createParquetInputFile() throws IOException { - File parquetFile = new File(tempDir, "junit" + System.nanoTime()); - - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = - Parquet.write(outFile) - .schema(fileSchema) - .createWriterFunc(GenericParquetWriter::create) - .build()) { - // create 50 records - for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { - GenericRecord record = GenericRecord.create(fileSchema); - record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 - record.setField( - "_no_stats_parquet", - TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats - // in Parquet - record.setField("_required", "req"); // required, always non-null - record.setField("_all_nulls", null); // never non-null - record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values - record.setField("_no_nulls", ""); // optional, but always non-null - record.setField("_all_nans", Double.NaN); // never non-nan - record.setField("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values - record.setField( - "_some_double_nans", (i % 10 == 0) ? 
Double.NaN : 2D); // includes some nan values - record.setField("_no_nans", 3D); // optional, but always non-nan - record.setField("_str", i + "str" + i); - - GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE); - structNotNull.setField("_int_field", INT_MIN_VALUE + i); - record.setField("_struct_not_null", structNotNull); // struct with int - - if (i % 2 == 0) { - VariantMetadata metadata = Variants.metadata("data"); - ShreddedObject obj = Variants.object(metadata); - obj.put("data", Variants.of(25 + i)); - record.setField("_some_nulls_variant", Variant.of(metadata, obj)); - } else { - record.setField("_some_nulls_variant", null); - } - record.setField("_all_nulls_variant", null); - - appender.add(record); - } - } - - InputFile inFile = Files.localInput(parquetFile); - try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { - assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); - rowGroupMetadata = reader.getRowGroups().get(0); - parquetSchema = reader.getFileMetaData().getSchema(); - } - - parquetFile.deleteOnExit(); - } - - @TestTemplate - public void testAllNulls() { - boolean shouldRead; - - shouldRead = shouldRead(notNull("all_nulls")); - assertThat(shouldRead).as("Should skip: no non-null value in all null column").isFalse(); - - shouldRead = shouldRead(notNull("some_nulls")); - assertThat(shouldRead) - .as("Should read: column with some nulls contains a non-null value") - .isTrue(); - - shouldRead = shouldRead(notNull("no_nulls")); - assertThat(shouldRead).as("Should read: non-null column contains a non-null value").isTrue(); - - shouldRead = shouldRead(notNull("map_not_null")); - assertThat(shouldRead).as("Should read: map type is not skipped").isTrue(); - - shouldRead = shouldRead(notNull("struct_not_null")); - assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue(); - } - - @TestTemplate - public void testNoNulls() { - boolean shouldRead = shouldRead(isNull("all_nulls")); - assertThat(shouldRead).as("Should read: at least one null value in all null column").isTrue(); - - shouldRead = shouldRead(isNull("some_nulls")); - assertThat(shouldRead).as("Should read: column with some nulls contains a null value").isTrue(); - - shouldRead = shouldRead(isNull("no_nulls")); - assertThat(shouldRead).as("Should skip: non-null column contains no null values").isFalse(); - - shouldRead = shouldRead(isNull("map_not_null")); - assertThat(shouldRead).as("Should read: map type is not skipped").isTrue(); - - shouldRead = shouldRead(isNull("struct_not_null")); - assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue(); - } - - @TestTemplate - public void testFloatWithNan() { - // NaN's should break Parquet's Min/Max stats we should be reading in all cases - boolean shouldRead = shouldRead(greaterThan("some_nans", 1.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(greaterThanOrEqual("some_nans", 1.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(lessThan("some_nans", 3.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(lessThanOrEqual("some_nans", 1.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(equal("some_nans", 2.0)); - assertThat(shouldRead).isTrue(); - } - - @TestTemplate - public void testDoubleWithNan() { - boolean shouldRead = shouldRead(greaterThan("some_double_nans", 1.0)); - assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue(); - - shouldRead = 
shouldRead(greaterThanOrEqual("some_double_nans", 1.0)); - assertThat(shouldRead) - .as("Should read: column with some nans contains the target value") - .isTrue(); - - shouldRead = shouldRead(lessThan("some_double_nans", 3.0)); - assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue(); - - shouldRead = shouldRead(lessThanOrEqual("some_double_nans", 1.0)); - assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue(); - - shouldRead = shouldRead(equal("some_double_nans", 2.0)); - assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue(); - } - - @TestTemplate - public void testIsNaN() { - boolean shouldRead = shouldRead(isNaN("all_nans")); - assertThat(shouldRead) - .as("Should read: NaN counts are not tracked in Parquet metrics") - .isTrue(); - - shouldRead = shouldRead(isNaN("some_nans")); - assertThat(shouldRead) - .as("Should read: NaN counts are not tracked in Parquet metrics") - .isTrue(); - - shouldRead = shouldRead(isNaN("no_nans")); - switch (format) { - case ORC: - assertThat(shouldRead) - .as("Should read 0 rows due to the ORC filter push-down feature") - .isFalse(); - break; - case PARQUET: - assertThat(shouldRead) - .as("Should read: NaN counts are not tracked in Parquet metrics") - .isTrue(); - break; - default: - throw new UnsupportedOperationException( - "Row group filter tests not supported for " + format); - } - - shouldRead = shouldRead(isNaN("all_nulls")); - assertThat(shouldRead).as("Should skip: all null column will not contain nan value").isFalse(); - } - - @TestTemplate - public void testNotNaN() { - boolean shouldRead = shouldRead(notNaN("all_nans")); - assertThat(shouldRead) - .as("Should read: NaN counts are not tracked in Parquet metrics") - .isTrue(); - - shouldRead = shouldRead(notNaN("some_nans")); - assertThat(shouldRead) - .as("Should read: NaN counts are not tracked in Parquet metrics") - .isTrue(); - - shouldRead = shouldRead(notNaN("no_nans")); - assertThat(shouldRead) - .as("Should read: NaN counts are not tracked in Parquet metrics") - .isTrue(); - - shouldRead = shouldRead(notNaN("all_nulls")); - assertThat(shouldRead) - .as("Should read: NaN counts are not tracked in Parquet metrics") - .isTrue(); - } - - @TestTemplate - public void testRequiredColumn() { - boolean shouldRead = shouldRead(notNull("required")); - assertThat(shouldRead).as("Should read: required columns are always non-null").isTrue(); - - shouldRead = shouldRead(isNull("required")); - assertThat(shouldRead).as("Should skip: required columns are always non-null").isFalse(); - } - - @TestTemplate - public void testMissingColumn() { - assertThatThrownBy(() -> shouldRead(lessThan("missing", 5))) - .as("Should complain about missing column in expression") - .isInstanceOf(ValidationException.class) - .hasMessageStartingWith("Cannot find field 'missing'"); - } - - @TestTemplate - public void testColumnNotInFile() { - assumeThat(format) - .as( - "If a column is not in file, ORC does NOT try to apply predicates assuming null values for the column") - .isNotEqualTo(FileFormat.ORC); - - Expression[] cannotMatch = - new Expression[] { - lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), - equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), - greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file") - }; - - for (Expression expr : cannotMatch) { - boolean shouldRead = shouldRead(expr); - assertThat(shouldRead) - .as("Should skip when column is not in file 
(all nulls): " + expr) - .isFalse(); - } - - Expression[] canMatch = new Expression[] {isNull("not_in_file"), notEqual("not_in_file", 1.0f)}; - - for (Expression expr : canMatch) { - boolean shouldRead = shouldRead(expr); - assertThat(shouldRead) - .as("Should read when column is not in file (all nulls): " + expr) - .isTrue(); - } - } - - @TestTemplate - public void testMissingStatsParquet() { - assumeThat(format).isEqualTo(FileFormat.PARQUET); - - Expression[] exprs = - new Expression[] { - lessThan("no_stats_parquet", "a"), - lessThanOrEqual("no_stats_parquet", "b"), - equal("no_stats_parquet", "c"), - greaterThan("no_stats_parquet", "d"), - greaterThanOrEqual("no_stats_parquet", "e"), - notEqual("no_stats_parquet", "f"), - isNull("no_stats_parquet"), - notNull("no_stats_parquet"), - startsWith("no_stats_parquet", "a"), - notStartsWith("no_stats_parquet", "a") - }; - - for (Expression expr : exprs) { - boolean shouldRead = shouldRead(expr); - assertThat(shouldRead).as("Should read when missing stats for expr: " + expr).isTrue(); - } - } - - @TestTemplate - public void testZeroRecordFileParquet() { - assumeThat(format).isEqualTo(FileFormat.PARQUET); - - BlockMetaData emptyBlock = new BlockMetaData(); - emptyBlock.setRowCount(0); - - Expression[] exprs = - new Expression[] { - lessThan("id", 5), - lessThanOrEqual("id", 30), - equal("id", 70), - greaterThan("id", 78), - greaterThanOrEqual("id", 90), - notEqual("id", 101), - isNull("some_nulls"), - notNull("some_nulls") - }; - - for (Expression expr : exprs) { - boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock); - assertThat(shouldRead).as("Should never read 0-record file: " + expr).isFalse(); - } - } - - @TestTemplate - public void testNot() { - // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = shouldRead(not(lessThan("id", INT_MIN_VALUE - 25))); - assertThat(shouldRead).as("Should read: not(false)").isTrue(); - - shouldRead = shouldRead(not(greaterThan("id", INT_MIN_VALUE - 25))); - assertThat(shouldRead).as("Should skip: not(true)").isFalse(); - } - - @TestTemplate - public void testAnd() { - // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = - shouldRead( - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30))); - assertThat(shouldRead).as("Should skip: and(false, true)").isFalse(); - - shouldRead = - shouldRead( - and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))); - assertThat(shouldRead).as("Should skip: and(false, false)").isFalse(); - - shouldRead = - shouldRead( - and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))); - assertThat(shouldRead).as("Should read: and(true, true)").isTrue(); - } - - @TestTemplate - public void testOr() { - // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out - boolean shouldRead = - shouldRead( - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))); - assertThat(shouldRead).as("Should skip: or(false, false)").isFalse(); - - shouldRead = - shouldRead( - or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))); - assertThat(shouldRead).as("Should read: or(false, true)").isTrue(); - } - - @TestTemplate - public void testIntegerLt() { - boolean shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should not read: id 
range below lower bound (5 < 30)").isFalse(); - - shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE)); - assertThat(shouldRead) - .as("Should not read: id range below lower bound (30 is not < 30)") - .isFalse(); - - shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE + 1)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(lessThan("id", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: may possible ids").isTrue(); - } - - @TestTemplate - public void testIntegerLtEq() { - boolean shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse(); - - shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse(); - - shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(lessThanOrEqual("id", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: many possible ids").isTrue(); - } - - @TestTemplate - public void testIntegerGt() { - boolean shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 79)").isFalse(); - - shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE)); - assertThat(shouldRead) - .as("Should not read: id range above upper bound (79 is not > 79)") - .isFalse(); - - shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 1)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: may possible ids").isTrue(); - } - - @TestTemplate - public void testIntegerGtEq() { - boolean shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 79)").isFalse(); - - shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse(); - - shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: may possible ids").isTrue(); - } - - @TestTemplate - public void testIntegerEq() { - boolean shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); - - shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); - - shouldRead = shouldRead(equal("id", INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); - - shouldRead = shouldRead(equal("id", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); - - shouldRead = shouldRead(equal("id", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); - - shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); - - shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); - 
} - - @TestTemplate - public void testIntegerNotEq() { - boolean shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - } - - @TestTemplate - public void testIntegerNotEqRewritten() { - boolean shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 25))); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 1))); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE))); - assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); - - shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE - 4))); - assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); - - shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE))); - assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); - - shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 1))); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - - shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 6))); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - } - - @TestTemplate - public void testStructFieldLt() { - boolean shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse(); - - shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE)); - assertThat(shouldRead) - .as("Should not read: id range below lower bound (30 is not < 30)") - .isFalse(); - - shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE + 1)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: may possible ids").isTrue(); - } - - @TestTemplate - public void testStructFieldLtEq() { - boolean shouldRead = - shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse(); - - shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse(); - - shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", 
INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: many possible ids").isTrue(); - } - - @TestTemplate - public void testStructFieldGt() { - boolean shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 79)").isFalse(); - - shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE)); - assertThat(shouldRead) - .as("Should not read: id range above upper bound (79 is not > 79)") - .isFalse(); - - shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 1)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: may possible ids").isTrue(); - } - - @TestTemplate - public void testStructFieldGtEq() { - boolean shouldRead = - shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 79)").isFalse(); - - shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse(); - - shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: one possible id").isTrue(); - - shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: may possible ids").isTrue(); - } - - @TestTemplate - public void testStructFieldEq() { - boolean shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); - - shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); - - shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); - - shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); - - shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); - - shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); - - shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); - } - - @TestTemplate - public void testStructFieldNotEq() { - boolean shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: id between lower and 
upper bounds").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - } - - @TestTemplate - public void testCaseInsensitive() { - boolean shouldRead = shouldRead(equal("ID", INT_MIN_VALUE - 25), false); - assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); - } - - @TestTemplate - public void testStringStartsWith() { - assumeThat(format) - .as("ORC row group filter does not support StringStartsWith") - .isNotEqualTo(FileFormat.ORC); - - boolean shouldRead = shouldRead(startsWith("str", "1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "0st")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "1str1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "1str1_xgd")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "2str")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "9xstr")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - - shouldRead = shouldRead(startsWith("str", "0S")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - - shouldRead = shouldRead(startsWith("str", "x")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - - shouldRead = shouldRead(startsWith("str", "9str9aaa")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - } - - @TestTemplate - public void testStringNotStartsWith() { - assumeThat(format) - .as("ORC row group filter does not support StringStartsWith") - .isNotEqualTo(FileFormat.ORC); - - boolean shouldRead = shouldRead(notStartsWith("str", "1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "0st")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "1str1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "1str1_xgd")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "2str")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "9xstr")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("required", "r")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - - shouldRead = shouldRead(notStartsWith("required", "requ")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("some_nulls", "ssome")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("some_nulls", "som")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - } - - 
@TestTemplate - public void testIntegerIn() { - boolean shouldRead = shouldRead(in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); - assertThat(shouldRead).as("Should not read: id below lower bound (5 < 30, 6 < 30)").isFalse(); - - shouldRead = shouldRead(in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should not read: id below lower bound (28 < 30, 29 < 30)").isFalse(); - - shouldRead = shouldRead(in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 30)").isTrue(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); - assertThat(shouldRead) - .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") - .isTrue(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); - assertThat(shouldRead).as("Should not read: id above upper bound (80 > 79, 81 > 79)").isFalse(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); - assertThat(shouldRead).as("Should not read: id above upper bound (85 > 79, 86 > 79)").isFalse(); - - shouldRead = shouldRead(in("all_nulls", 1, 2)); - assertThat(shouldRead).as("Should skip: in on all nulls column").isFalse(); - - shouldRead = shouldRead(in("some_nulls", "aaa", "some")); - assertThat(shouldRead).as("Should read: in on some nulls column").isTrue(); - - shouldRead = shouldRead(in("no_nulls", "aaa", "")); - assertThat(shouldRead).as("Should read: in on no nulls column").isTrue(); - } - - @TestTemplate - public void testIntegerNotIn() { - boolean shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); - assertThat(shouldRead).as("Should read: id below lower bound (5 < 30, 6 < 30)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should read: id below lower bound (28 < 30, 29 < 30)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 30)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); - assertThat(shouldRead) - .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") - .isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); - assertThat(shouldRead).as("Should read: id above upper bound (80 > 79, 81 > 79)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); - assertThat(shouldRead).as("Should read: id above upper bound (85 > 79, 86 > 79)").isTrue(); - - shouldRead = shouldRead(notIn("all_nulls", 1, 2)); - assertThat(shouldRead).as("Should read: notIn on all nulls column").isTrue(); - - shouldRead = shouldRead(notIn("some_nulls", "aaa", "some")); - assertThat(shouldRead).as("Should read: notIn on some nulls column").isTrue(); - - shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); - if (format == FileFormat.PARQUET) { - // no_nulls column has all values == "", so notIn("no_nulls", "") should always be false and - // so should be skipped - // However, the metrics evaluator in Parquets always reads row group for a 
notIn filter - assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue(); - } else { - assertThat(shouldRead).as("Should skip: notIn on no nulls column").isFalse(); - } - } - - @TestTemplate - public void testSomeNullsNotEq() { - boolean shouldRead = shouldRead(notEqual("some_nulls", "some")); - assertThat(shouldRead).as("Should read: notEqual on some nulls column").isTrue(); - } - - @TestTemplate - public void testInLimitParquet() { - assumeThat(format).isEqualTo(FileFormat.PARQUET); - - boolean shouldRead = shouldRead(in("id", 1, 2)); - assertThat(shouldRead).as("Should not read if IN is evaluated").isFalse(); - - List ids = Lists.newArrayListWithExpectedSize(400); - for (int id = -400; id <= 0; id++) { - ids.add(id); - } - - shouldRead = shouldRead(in("id", ids)); - assertThat(shouldRead).as("Should read if IN is not evaluated").isTrue(); - } - - @TestTemplate - public void testParquetTypePromotion() { - assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); - - Schema promotedSchema = new Schema(required(1, "id", Types.LongType.get())); - boolean shouldRead = - new ParquetMetricsRowGroupFilter(promotedSchema, equal("id", INT_MIN_VALUE + 1), true) - .shouldRead(parquetSchema, rowGroupMetadata); - assertThat(shouldRead).as("Should succeed with promoted schema").isTrue(); - } - - @TestTemplate - public void testTransformFilter() { - assumeThat(format).isEqualTo(FileFormat.PARQUET); - - boolean shouldRead = - new ParquetMetricsRowGroupFilter( - testSchema, equal(truncate("required", 2), "some_value"), true) - .shouldRead(parquetSchema, rowGroupMetadata); - assertThat(shouldRead) - .as("Should read: filter contains non-reference evaluate as True") - .isTrue(); - } - - @TestTemplate - public void testVariantNotNull() { - assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - - boolean shouldRead = shouldRead(notNull("some_nulls_variant")); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan") - .isTrue(); - - shouldRead = shouldRead(notNull("all_nulls_variant")); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") - .isTrue(); - } - - @TestTemplate - public void testVariantEq() { - assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - - assertThatThrownBy(() -> shouldRead(equal("some_nulls_variant", "test"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test"); - } - - @TestTemplate - public void testVariantIn() { - assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - - assertThatThrownBy(() -> shouldRead(in("some_nulls_variant", "test1", "test2"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - - assertThatThrownBy(() -> shouldRead(in("all_nulls_variant", "test1", "test2"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - } - - @TestTemplate - public void testVariantNotIn() { - assumeThat(format).isNotEqualTo(FileFormat.ORC); - - // Variant columns cannot be used in 'notIn' expressions with literals - assertThatThrownBy(() -> shouldRead(notIn("some_nulls_variant", "test1", "test2"))) - 
.isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - - assertThatThrownBy(() -> shouldRead(notIn("all_nulls_variant", "test1", "test2"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - } - - @TestTemplate - public void testVariantIsNull() { - assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - - boolean shouldRead = shouldRead(isNull("some_nulls_variant")); - assertThat(shouldRead) - .as("Should read: variant isNull filters must be evaluated post scan") - .isTrue(); - - shouldRead = shouldRead(isNull("all_nulls_variant")); - assertThat(shouldRead) - .as("Should read: variant isNull filters must be evaluated post scan even for all nulls") - .isTrue(); - } - - @TestTemplate - public void testVariantComparisons() { - assumeThat(format).isNotEqualTo(FileFormat.ORC).as("ORC does not fully support variant type"); - - // Variant columns cannot be used in comparison expressions with literals - assertThatThrownBy(() -> shouldRead(lessThan("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(lessThanOrEqual("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(greaterThan("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(greaterThanOrEqual("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(notEqual("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - } - - private boolean shouldRead(Expression expression) { - return shouldRead(expression, true); - } - - private boolean shouldRead(Expression expression, boolean caseSensitive) { - switch (format) { - case ORC: - return shouldReadOrc(expression, caseSensitive); - case PARQUET: - return shouldReadParquet(expression, caseSensitive, parquetSchema, rowGroupMetadata); - default: - throw new UnsupportedOperationException( - "Row group filter tests not supported for " + format); - } - } - - private boolean shouldReadOrc(Expression expression, boolean caseSensitive) { - try (CloseableIterable reader = - ORC.read(Files.localInput(orcFile)) - .project(testSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(testSchema, fileSchema)) - .filter(expression) - .caseSensitive(caseSensitive) - .build()) { - return !Lists.newArrayList(reader).isEmpty(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - private boolean shouldReadParquet( - Expression expression, - boolean caseSensitive, - MessageType messageType, - BlockMetaData blockMetaData) { - return new ParquetMetricsRowGroupFilter(testSchema, expression, caseSensitive) - .shouldRead(messageType, blockMetaData); - } - - private org.apache.parquet.io.InputFile 
parquetInputFile(InputFile inFile) { - return new org.apache.parquet.io.InputFile() { - @Override - public long getLength() throws IOException { - return inFile.getLength(); - } - - @Override - public org.apache.parquet.io.SeekableInputStream newStream() throws IOException { - SeekableInputStream stream = inFile.newStream(); - return new DelegatingSeekableInputStream(stream) { - @Override - public long getPos() throws IOException { - return stream.getPos(); - } - - @Override - public void seek(long newPos) throws IOException { - stream.seek(newPos); - } - }; - } - }; - } -} diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java new file mode 100644 index 000000000000..5a2ff2fb6727 --- /dev/null +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data.orc; + +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.Files; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.MetricsRowGroupFilterTestBase; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.junit.jupiter.api.Test; + +public class TestOrcMetricsRowGroupFilter extends MetricsRowGroupFilterTestBase { + + private File orcFile = null; + + @Override + protected void createInputFile() throws IOException { + this.orcFile = new File(tempDir, "junit" + System.nanoTime()); + + OutputFile outFile = Files.localOutput(orcFile); + try (FileAppender appender = + ORC.write(outFile) + .schema(BASE_FILE_SCHEMA) + .createWriterFunc(GenericOrcWriter::buildWriter) + .build()) { + GenericRecord record = GenericRecord.create(BASE_FILE_SCHEMA); + for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { + populateBaseFields(record, i); + appender.add(record); + } + } + InputFile inFile = Files.localInput(orcFile); + try (Reader reader = + OrcFile.createReader( + new Path(inFile.location()), OrcFile.readerOptions(new Configuration()))) { + 
assertThat(reader.getStripes()).as("Should create only one stripe").hasSize(1); + } + orcFile.deleteOnExit(); + } + + @Override + protected boolean shouldRead(Expression expression, boolean caseSensitive) { + try (CloseableIterable reader = + ORC.read(Files.localInput(orcFile)) + .project(BASE_SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(BASE_SCHEMA, fileSchema)) + .filter(expression) + .caseSensitive(caseSensitive) + .build()) { + return !Lists.newArrayList(reader).isEmpty(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Test + public void testIsNaNNoNans() { + boolean shouldRead = shouldRead(isNaN("no_nans")); + assertThat(shouldRead).as("ORC filter push-down can skip groups with no NaN").isFalse(); + } + + @Test + public void testNotInNoNulls() { + boolean shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); + assertThat(shouldRead).as("ORC can skip when all values match excluded set").isFalse(); + } +} diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java new file mode 100644 index 000000000000..94d098f10a25 --- /dev/null +++ b/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java @@ -0,0 +1,444 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data.parquet; + +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.iceberg.expressions.Expressions.truncate; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.UUID; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.MetricsRowGroupFilterTestBase; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.SeekableInputStream; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.Variants; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.io.DelegatingSeekableInputStream; +import org.apache.parquet.schema.MessageType; +import org.junit.jupiter.api.Test; + +public class TestParquetMetricsRowGroupFilter extends MetricsRowGroupFilterTestBase { + + private static final Schema PARQUET_SCHEMA = + new Schema( + Lists.newArrayList( + Iterables.concat( + BASE_SCHEMA.columns(), + Lists.newArrayList( + optional(17, "no_stats_parquet", StringType.get()), + optional(18, "some_nulls_variant", Types.VariantType.get()), + optional(19, "all_nulls_variant", Types.VariantType.get()))))); + + private static final Schema PARQUET_FILE_SCHEMA = + new Schema( + Lists.newArrayList( + Iterables.concat( + BASE_FILE_SCHEMA.columns(), + Lists.newArrayList( + optional(17, "_no_stats_parquet", StringType.get()), + optional(18, "_some_nulls_variant", Types.VariantType.get()), + optional(19, "_all_nulls_variant", Types.VariantType.get()))))); + + private MessageType parquetSchema = null; + private BlockMetaData rowGroupMetadata = null; + + @Override + protected void createInputFile() throws IOException { + File parquetFile = new File(tempDir, "junit" + 
System.nanoTime()); + + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 200; i += 1) { + sb.append(UUID.randomUUID()); + } + String tooLongForParquetValue = sb.toString(); + + OutputFile outFile = Files.localOutput(parquetFile); + try (FileAppender appender = + Parquet.write(outFile) + .schema(PARQUET_FILE_SCHEMA) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { + GenericRecord record = GenericRecord.create(PARQUET_FILE_SCHEMA); + populateBaseFields(record, i); + record.setField( + "_no_stats_parquet", + tooLongForParquetValue); // value longer than 4k will produce no stats + if (i % 2 == 0) { + VariantMetadata metadata = Variants.metadata("data"); + ShreddedObject obj = Variants.object(metadata); + obj.put("data", Variants.of(25 + i)); + record.setField("_some_nulls_variant", Variant.of(metadata, obj)); + } else { + record.setField("_some_nulls_variant", null); + } + record.setField("_all_nulls_variant", null); + appender.add(record); + } + } + + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); + rowGroupMetadata = reader.getRowGroups().get(0); + parquetSchema = reader.getFileMetaData().getSchema(); + } + parquetFile.deleteOnExit(); + } + + @Override + protected boolean shouldRead(Expression expression, boolean caseSensitive) { + return shouldRead(expression, caseSensitive, rowGroupMetadata); + } + + @Test + public void testIsNaNNoNans() { + boolean shouldRead = shouldRead(isNaN("no_nans")); + assertThat(shouldRead).as("Parquet metrics don't track NaN counts").isTrue(); + } + + @Test + public void testNotInNoNulls() { + boolean shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); + assertThat(shouldRead).as("Parquet metrics evaluator reads for notIn").isTrue(); + } + + @Test + public void testColumnNotInFile() { + Expression[] cannotMatch = + new Expression[] { + lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), + equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), + greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file") + }; + + for (Expression expr : cannotMatch) { + boolean shouldRead = shouldRead(expr); + assertThat(shouldRead) + .as("Should skip when column is not in file (all nulls): " + expr) + .isFalse(); + } + + Expression[] canMatch = new Expression[] {isNull("not_in_file"), notEqual("not_in_file", 1.0f)}; + + for (Expression expr : canMatch) { + boolean shouldRead = shouldRead(expr); + assertThat(shouldRead) + .as("Should read when column is not in file (all nulls): " + expr) + .isTrue(); + } + } + + @Test + public void testMissingStatsParquet() { + Expression[] exprs = + new Expression[] { + lessThan("no_stats_parquet", "a"), + lessThanOrEqual("no_stats_parquet", "b"), + equal("no_stats_parquet", "c"), + greaterThan("no_stats_parquet", "d"), + greaterThanOrEqual("no_stats_parquet", "e"), + notEqual("no_stats_parquet", "f"), + isNull("no_stats_parquet"), + notNull("no_stats_parquet"), + startsWith("no_stats_parquet", "a"), + notStartsWith("no_stats_parquet", "a") + }; + + for (Expression expr : exprs) { + boolean shouldRead = shouldRead(expr); + assertThat(shouldRead).as("Should read when missing stats for expr: " + expr).isTrue(); + } + } + + @Test + public void testZeroRecordFileParquet() { + BlockMetaData emptyBlock = new BlockMetaData(); + emptyBlock.setRowCount(0); + + 
+    Expression[] exprs =
+        new Expression[] {
+          lessThan("id", 5),
+          lessThanOrEqual("id", 30),
+          equal("id", 70),
+          greaterThan("id", 78),
+          greaterThanOrEqual("id", 90),
+          notEqual("id", 101),
+          isNull("some_nulls"),
+          notNull("some_nulls")
+        };
+
+    for (Expression expr : exprs) {
+      boolean shouldRead = shouldRead(expr, true, emptyBlock);
+      assertThat(shouldRead).as("Should never read 0-record file: " + expr).isFalse();
+    }
+  }
+
+  @Test
+  public void testStringStartsWith() {
+    boolean shouldRead = shouldRead(startsWith("str", "1"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(startsWith("str", "0st"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(startsWith("str", "1str1"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(startsWith("str", "1str1_xgd"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(startsWith("str", "2str"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(startsWith("str", "9xstr"));
+    assertThat(shouldRead).as("Should not read: range doesn't match").isFalse();
+
+    shouldRead = shouldRead(startsWith("str", "0S"));
+    assertThat(shouldRead).as("Should not read: range doesn't match").isFalse();
+
+    shouldRead = shouldRead(startsWith("str", "x"));
+    assertThat(shouldRead).as("Should not read: range doesn't match").isFalse();
+
+    shouldRead = shouldRead(startsWith("str", "9str9aaa"));
+    assertThat(shouldRead).as("Should not read: range doesn't match").isFalse();
+  }
+
+  @Test
+  public void testStringNotStartsWith() {
+    boolean shouldRead = shouldRead(notStartsWith("str", "1"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("str", "0st"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("str", "1str1"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("str", "1str1_xgd"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("str", "2str"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("str", "9xstr"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("required", "r"));
+    assertThat(shouldRead).as("Should not read: range doesn't match").isFalse();
+
+    shouldRead = shouldRead(notStartsWith("required", "requ"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("some_nulls", "ssome"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+
+    shouldRead = shouldRead(notStartsWith("some_nulls", "som"));
+    assertThat(shouldRead).as("Should read: range matches").isTrue();
+  }
+
+  @Test
+  public void testInLimitParquet() {
+    boolean shouldRead = shouldRead(in("id", 1, 2));
+    assertThat(shouldRead).as("Should not read if IN is evaluated").isFalse();
+
+    List<Integer> ids = Lists.newArrayListWithExpectedSize(400);
+    for (int id = -400; id <= 0; id++) {
+      ids.add(id);
+    }
+
+    shouldRead = shouldRead(in("id", ids));
+    assertThat(shouldRead).as("Should read if IN is not evaluated").isTrue();
+  }
+
+  @Test
+  public void testParquetTypePromotion() {
+    Schema promotedSchema = new Schema(Types.NestedField.required(1,
"id", Types.LongType.get())); + boolean shouldRead = + new ParquetMetricsRowGroupFilter(promotedSchema, equal("id", INT_MIN_VALUE + 1), true) + .shouldRead(parquetSchema, rowGroupMetadata); + assertThat(shouldRead).as("Should succeed with promoted schema").isTrue(); + } + + @Test + public void testTransformFilter() { + boolean shouldRead = + new ParquetMetricsRowGroupFilter( + PARQUET_SCHEMA, equal(truncate("required", 2), "some_value"), true) + .shouldRead(parquetSchema, rowGroupMetadata); + assertThat(shouldRead) + .as("Should read: filter contains non-reference evaluate as True") + .isTrue(); + } + + @Test + public void testVariantNotNull() { + boolean shouldRead = shouldRead(notNull("some_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan") + .isTrue(); + + shouldRead = shouldRead(notNull("all_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") + .isTrue(); + } + + @Test + public void testVariantEq() { + assertThatThrownBy(() -> shouldRead(equal("some_nulls_variant", "test"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test"); + } + + @Test + public void testVariantIn() { + assertThatThrownBy(() -> shouldRead(in("some_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); + + assertThatThrownBy(() -> shouldRead(in("all_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); + } + + @Test + public void testVariantNotIn() { + assertThatThrownBy(() -> shouldRead(notIn("some_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); + + assertThatThrownBy(() -> shouldRead(notIn("all_nulls_variant", "test1", "test2"))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("test1"); + } + + @Test + public void testVariantIsNull() { + boolean shouldRead = shouldRead(isNull("some_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant isNull filters must be evaluated post scan") + .isTrue(); + + shouldRead = shouldRead(isNull("all_nulls_variant")); + assertThat(shouldRead) + .as("Should read: variant isNull filters must be evaluated post scan even for all nulls") + .isTrue(); + } + + @Test + public void testVariantComparisons() { + assertThatThrownBy(() -> shouldRead(lessThan("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); + + assertThatThrownBy(() -> shouldRead(lessThanOrEqual("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); + + assertThatThrownBy(() -> shouldRead(greaterThan("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); + + assertThatThrownBy(() -> 
shouldRead(greaterThanOrEqual("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); + + assertThatThrownBy(() -> shouldRead(notEqual("some_nulls_variant", 42))) + .isInstanceOf(ValidationException.class) + .hasMessageContaining("Invalid value for conversion to type variant:") + .hasMessageContaining("42"); + } + + private boolean shouldRead(Expression expression, boolean caseSensitive, BlockMetaData block) { + if (parquetSchema == null || block == null) { + throw new IllegalStateException( + "Parquet file not created yet. Call createInputFile() first."); + } + return new ParquetMetricsRowGroupFilter(PARQUET_SCHEMA, expression, caseSensitive) + .shouldRead(parquetSchema, block); + } + + private org.apache.parquet.io.InputFile parquetInputFile(InputFile inFile) { + return new org.apache.parquet.io.InputFile() { + @Override + public long getLength() throws IOException { + return inFile.getLength(); + } + + @Override + public org.apache.parquet.io.SeekableInputStream newStream() throws IOException { + SeekableInputStream stream = inFile.newStream(); + return new DelegatingSeekableInputStream(stream) { + @Override + public long getPos() throws IOException { + return stream.getPos(); + } + + @Override + public void seek(long newPos) throws IOException { + stream.seek(newPos); + } + }; + } + }; + } +} From ab202af7eccb3d495dd362ad8da51b3269f4a2cc Mon Sep 17 00:00:00 2001 From: geruh Date: Wed, 17 Sep 2025 13:19:56 -0700 Subject: [PATCH 4/9] Revert refactoring --- .../data/MetricsRowGroupFilterTestBase.java | 619 --------- .../data/TestMetricsRowGroupFilter.java | 1150 +++++++++++++++++ .../orc/TestOrcMetricsRowGroupFilter.java | 99 -- .../TestParquetMetricsRowGroupFilter.java | 444 ------- 4 files changed, 1150 insertions(+), 1162 deletions(-) delete mode 100644 data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java create mode 100644 data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java delete mode 100644 data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java delete mode 100644 data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java diff --git a/data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java b/data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java deleted file mode 100644 index 3e8e7f9d66e3..000000000000 --- a/data/src/test/java/org/apache/iceberg/data/MetricsRowGroupFilterTestBase.java +++ /dev/null @@ -1,619 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.data; - -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNaN; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.File; -import java.io.IOException; -import org.apache.iceberg.Schema; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; - -public abstract class MetricsRowGroupFilterTestBase { - - protected static final Types.StructType STRUCT_FIELD_TYPE = - Types.StructType.of(Types.NestedField.required(7, "int_field", Types.IntegerType.get())); - - protected static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE = - Types.StructType.of(Types.NestedField.required(7, "_int_field", Types.IntegerType.get())); - - protected static final Schema BASE_SCHEMA = - new Schema( - required(1, "id", Types.IntegerType.get()), - required(2, "required", Types.StringType.get()), - optional(3, "all_nulls", Types.DoubleType.get()), - optional(4, "some_nulls", Types.StringType.get()), - optional(5, "no_nulls", Types.StringType.get()), - optional(6, "struct_not_null", STRUCT_FIELD_TYPE), - optional(8, "not_in_file", Types.FloatType.get()), - optional(9, "str", Types.StringType.get()), - optional( - 10, - "map_not_null", - Types.MapType.ofRequired(11, 12, Types.StringType.get(), Types.IntegerType.get())), - optional(13, "all_nans", Types.DoubleType.get()), - optional(14, "some_nans", Types.FloatType.get()), - optional(15, "no_nans", Types.DoubleType.get()), - optional(16, "some_double_nans", Types.DoubleType.get())); - - protected static final Schema BASE_FILE_SCHEMA = - new Schema( - Types.NestedField.required(1, "_id", Types.IntegerType.get()), - Types.NestedField.required(2, "_required", Types.StringType.get()), - Types.NestedField.optional(3, "_all_nulls", Types.DoubleType.get()), - Types.NestedField.optional(4, "_some_nulls", Types.StringType.get()), - Types.NestedField.optional(5, "_no_nulls", Types.StringType.get()), - Types.NestedField.optional(6, "_struct_not_null", UNDERSCORE_STRUCT_FIELD_TYPE), - Types.NestedField.optional(9, "_str", Types.StringType.get()), - Types.NestedField.optional(13, "_all_nans", Types.DoubleType.get()), - Types.NestedField.optional(14, "_some_nans", 
Types.FloatType.get()), - Types.NestedField.optional(15, "_no_nans", Types.DoubleType.get()), - Types.NestedField.optional(16, "_some_double_nans", Types.DoubleType.get())); - - protected static final int INT_MIN_VALUE = 30; - protected static final int INT_MAX_VALUE = 79; - - @TempDir protected File tempDir; - - @BeforeEach - public final void before() throws IOException { - createInputFile(); - } - - protected abstract void createInputFile() throws IOException; - - protected void populateBaseFields(GenericRecord record, int rowIndex) { - record.setField("_id", INT_MIN_VALUE + rowIndex); - record.setField("_required", "req"); - record.setField("_all_nulls", null); - record.setField("_some_nulls", (rowIndex % 10 == 0) ? null : "some"); - record.setField("_no_nulls", ""); - record.setField("_all_nans", Double.NaN); - record.setField("_some_nans", (rowIndex % 10 == 0) ? Float.NaN : 2F); - record.setField("_some_double_nans", (rowIndex % 10 == 0) ? Double.NaN : 2D); - record.setField("_no_nans", 3D); - record.setField("_str", rowIndex + "str" + rowIndex); - GenericRecord struct = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE); - struct.setField("_int_field", INT_MIN_VALUE + rowIndex); - record.setField("_struct_not_null", struct); - } - - protected boolean shouldRead(Expression expression) { - return shouldRead(expression, true); - } - - protected abstract boolean shouldRead(Expression expression, boolean caseSensitive); - - @Test - public void testAllNulls() { - boolean shouldRead; - - shouldRead = shouldRead(notNull("all_nulls")); - assertThat(shouldRead).as("Should skip: no non-null value in all null column").isFalse(); - - shouldRead = shouldRead(notNull("some_nulls")); - assertThat(shouldRead) - .as("Should read: column with some nulls contains a non-null value") - .isTrue(); - - shouldRead = shouldRead(notNull("no_nulls")); - assertThat(shouldRead).as("Should read: non-null column contains a non-null value").isTrue(); - - shouldRead = shouldRead(notNull("map_not_null")); - assertThat(shouldRead).as("Should read: map type is not skipped").isTrue(); - - shouldRead = shouldRead(notNull("struct_not_null")); - assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue(); - } - - @Test - public void testNoNulls() { - boolean shouldRead = shouldRead(isNull("all_nulls")); - assertThat(shouldRead).as("Should read: at least one null value in all null column").isTrue(); - - shouldRead = shouldRead(isNull("some_nulls")); - assertThat(shouldRead).as("Should read: column with some nulls contains a null value").isTrue(); - - shouldRead = shouldRead(isNull("no_nulls")); - assertThat(shouldRead).as("Should skip: non-null column contains no null values").isFalse(); - - shouldRead = shouldRead(isNull("map_not_null")); - assertThat(shouldRead).as("Should read: map type is not skipped").isTrue(); - - shouldRead = shouldRead(isNull("struct_not_null")); - assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue(); - } - - @Test - public void testFloatWithNan() { - boolean shouldRead = shouldRead(greaterThan("some_nans", 1.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(greaterThanOrEqual("some_nans", 1.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(lessThan("some_nans", 3.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(lessThanOrEqual("some_nans", 1.0)); - assertThat(shouldRead).isTrue(); - - shouldRead = shouldRead(equal("some_nans", 2.0)); - assertThat(shouldRead).isTrue(); - } - - @Test - public void 
testDoubleWithNan() {
-    boolean shouldRead = shouldRead(greaterThan("some_double_nans", 1.0));
-    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
-
-    shouldRead = shouldRead(greaterThanOrEqual("some_double_nans", 1.0));
-    assertThat(shouldRead)
-        .as("Should read: column with some nans contains the target value")
-        .isTrue();
-
-    shouldRead = shouldRead(lessThan("some_double_nans", 3.0));
-    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
-
-    shouldRead = shouldRead(lessThanOrEqual("some_double_nans", 1.0));
-    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
-
-    shouldRead = shouldRead(equal("some_double_nans", 2.0));
-    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
-  }
-
-  @Test
-  public void testIsNaN() {
-    boolean shouldRead = shouldRead(isNaN("all_nans"));
-    assertThat(shouldRead).isTrue();
-
-    shouldRead = shouldRead(isNaN("some_nans"));
-    assertThat(shouldRead).isTrue();
-
-    shouldRead = shouldRead(isNaN("all_nulls"));
-    assertThat(shouldRead).as("Should skip: all null column will not contain NaN").isFalse();
-  }
-
-  @Test
-  public void testNotNaN() {
-    boolean shouldRead = shouldRead(notNaN("all_nans"));
-    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
-
-    shouldRead = shouldRead(notNaN("some_nans"));
-    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
-
-    shouldRead = shouldRead(notNaN("no_nans"));
-    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
-
-    shouldRead = shouldRead(notNaN("all_nulls"));
-    assertThat(shouldRead).as("Should read: NaN counts are not tracked in metrics").isTrue();
-  }
-
-  @Test
-  public void testRequiredColumn() {
-    boolean shouldRead = shouldRead(notNull("required"));
-    assertThat(shouldRead).as("Should read: required columns are always non-null").isTrue();
-
-    shouldRead = shouldRead(isNull("required"));
-    assertThat(shouldRead).as("Should skip: required columns are always non-null").isFalse();
-  }
-
-  @Test
-  public void testMissingColumn() {
-    assertThatThrownBy(() -> shouldRead(lessThan("missing", 5)))
-        .as("Should complain about missing column in expression")
-        .isInstanceOf(ValidationException.class)
-        .hasMessageStartingWith("Cannot find field 'missing'");
-  }
-
-  @Test
-  public void testNot() {
-    // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out
-    boolean shouldRead = shouldRead(not(lessThan("id", INT_MIN_VALUE - 25)));
-    assertThat(shouldRead).as("Should read: not(false)").isTrue();
-
-    shouldRead = shouldRead(not(greaterThan("id", INT_MIN_VALUE - 25)));
-    assertThat(shouldRead).as("Should skip: not(true)").isFalse();
-  }
-
-  @Test
-  public void testAnd() {
-    // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out
-    boolean shouldRead =
-        shouldRead(
-            and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30)));
-    assertThat(shouldRead).as("Should skip: and(false, true)").isFalse();
-
-    shouldRead =
-        shouldRead(
-            and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1)));
-    assertThat(shouldRead).as("Should skip: and(false, false)").isFalse();
-
-    shouldRead =
-        shouldRead(
-            and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE)));
-    assertThat(shouldRead).as("Should read: and(true, true)").isTrue();
-  }
-
-  @Test
-  public void testOr() {
-    // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out
-    boolean shouldRead =
-        shouldRead(
-            or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1)));
-    assertThat(shouldRead).as("Should skip: or(false, false)").isFalse();
-
-    shouldRead =
-        shouldRead(
-            or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19)));
-    assertThat(shouldRead).as("Should read: or(false, true)").isTrue();
-  }
-
-  @Test
-  public void testIntegerLt() {
-    boolean shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE - 25));
-    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
-
-    shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE));
-    assertThat(shouldRead)
-        .as("Should not read: id range below lower bound (30 is not < 30)")
-        .isFalse();
-
-    shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE + 1));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(lessThan("id", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testIntegerLtEq() {
-    boolean shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 25));
-    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
-
-    shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 1));
-    assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse();
-
-    shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(lessThanOrEqual("id", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testIntegerGt() {
-    boolean shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE + 6));
-    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
-
-    shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE));
-    assertThat(shouldRead)
-        .as("Should not read: id range above upper bound (79 is not > 79)")
-        .isFalse();
-
-    shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 1));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 4));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testIntegerGtEq() {
-    boolean shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 6));
-    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
-
-    shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 1));
-    assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse();
-
-    shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE - 4));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testIntegerEq() {
-    boolean shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 25));
-    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
-
-    shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 1));
-    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
-
-    shouldRead = shouldRead(equal("id", INT_MIN_VALUE));
-    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
-
-    shouldRead = shouldRead(equal("id", INT_MAX_VALUE - 4));
-    assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue();
-
-    shouldRead = shouldRead(equal("id", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue();
-
-    shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 1));
-    assertThat(shouldRead).as("Should not read: id above upper bound").isFalse();
-
-    shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 6));
-    assertThat(shouldRead).as("Should not read: id above upper bound").isFalse();
-  }
-
-  @Test
-  public void testIntegerNotEq() {
-    boolean shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 25));
-    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
-
-    shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 1));
-    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
-
-    shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE));
-    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
-
-    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE - 4));
-    assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue();
-
-    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue();
-
-    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1));
-    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
-
-    shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 6));
-    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
-  }
-
-  @Test
-  public void testIntegerNotEqRewritten() {
-    boolean shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 25)));
-    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
-
-    shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 1)));
-    assertThat(shouldRead).as("Should read: id below lower bound").isTrue();
-
-    shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE)));
-    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
-
-    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE - 4)));
-    assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue();
-
-    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE)));
-    assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue();
-
-    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 1)));
-    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
-
-    shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 6)));
-    assertThat(shouldRead).as("Should read: id above upper bound").isTrue();
-  }
-
-  @Test
-  public void testStructFieldLt() {
-    boolean shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE - 25));
-    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
-
-    shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE));
-    assertThat(shouldRead)
-        .as("Should not read: id range below lower bound (30 is not < 30)")
-        .isFalse();
-
-    shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE + 1));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testStructFieldLtEq() {
-    boolean shouldRead =
-        shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25));
-    assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse();
-
-    shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1));
-    assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse();
-
-    shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testStructFieldGt() {
-    boolean shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE + 6));
-    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
-
-    shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE));
-    assertThat(shouldRead)
-        .as("Should not read: id range above upper bound (79 is not > 79)")
-        .isFalse();
-
-    shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 1));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 4));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testStructFieldGtEq() {
-    boolean shouldRead =
-        shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6));
-    assertThat(shouldRead).as("Should not read: id range above upper bound (85 > 79)").isFalse();
-
-    shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1));
-    assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse();
-
-    shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: one possible id").isTrue();
-
-    shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE - 4));
-    assertThat(shouldRead).as("Should read: many possible ids").isTrue();
-  }
-
-  @Test
-  public void testStructFieldEq() {
-    boolean shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 25));
-    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
-
-    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 1));
-    assertThat(shouldRead).as("Should not read: id below lower bound").isFalse();
-
-    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE));
-    assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue();
-
-    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE - 4));
-    assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue();
-
-    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE));
-    assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue();
-
-    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 1));
-    assertThat(shouldRead).as("Should not read: id above upper bound").isFalse();
-
-    shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 6));
-    assertThat(shouldRead).as("Should not read: id above upper bound").isFalse();
-  }
-
-  @Test
-  public void testStructFieldNotEq() {
-    boolean shouldRead =
shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)); - assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE)); - assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); - - shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - - shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); - assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); - } - - @Test - public void testCaseInsensitive() { - boolean shouldRead = shouldRead(equal("ID", INT_MIN_VALUE - 25), false); - assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); - } - - @Test - public void testIntegerIn() { - boolean shouldRead = shouldRead(in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); - assertThat(shouldRead).as("Should not read: id below lower bound (5 < 30, 6 < 30)").isFalse(); - - shouldRead = shouldRead(in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should not read: id below lower bound (28 < 30, 29 < 30)").isFalse(); - - shouldRead = shouldRead(in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 30)").isTrue(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); - assertThat(shouldRead) - .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") - .isTrue(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); - assertThat(shouldRead).as("Should not read: id above upper bound (80 > 79, 81 > 79)").isFalse(); - - shouldRead = shouldRead(in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); - assertThat(shouldRead).as("Should not read: id above upper bound (85 > 79, 86 > 79)").isFalse(); - - shouldRead = shouldRead(in("all_nulls", 1, 2)); - assertThat(shouldRead).as("Should skip: in on all nulls column").isFalse(); - - shouldRead = shouldRead(in("some_nulls", "aaa", "some")); - assertThat(shouldRead).as("Should read: in on some nulls column").isTrue(); - - shouldRead = shouldRead(in("no_nulls", "aaa", "")); - assertThat(shouldRead).as("Should read: in on no nulls column").isTrue(); - } - - @Test - public void testIntegerNotIn() { - boolean shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); - assertThat(shouldRead).as("Should read: id below lower bound (5 < 30, 6 < 30)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); - assertThat(shouldRead).as("Should read: id below lower bound (28 < 30, 29 < 30)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); - assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 
30)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); - assertThat(shouldRead) - .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") - .isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); - assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); - assertThat(shouldRead).as("Should read: id above upper bound (80 > 79, 81 > 79)").isTrue(); - - shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); - assertThat(shouldRead).as("Should read: id above upper bound (85 > 79, 86 > 79)").isTrue(); - - shouldRead = shouldRead(notIn("all_nulls", 1, 2)); - assertThat(shouldRead).as("Should read: notIn on all nulls column").isTrue(); - - shouldRead = shouldRead(notIn("some_nulls", "aaa", "some")); - assertThat(shouldRead).as("Should read: notIn on some nulls column").isTrue(); - } - - @Test - public void testSomeNullsNotEq() { - boolean shouldRead = shouldRead(notEqual("some_nulls", "some")); - assertThat(shouldRead).as("Should read: notEqual on some nulls column").isTrue(); - } -} diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java new file mode 100644 index 000000000000..a475fecf004b --- /dev/null +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -0,0 +1,1150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data; + +import static org.apache.iceberg.avro.AvroSchemaUtil.convert; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.iceberg.expressions.Expressions.truncate; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.List; +import java.util.UUID; +import org.apache.avro.generic.GenericData.Record; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Files; +import org.apache.iceberg.Parameter; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.data.orc.GenericOrcReader; +import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.SeekableInputStream; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.DoubleType; +import org.apache.iceberg.types.Types.FloatType; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.StringType; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.Variants; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import 
org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.io.DelegatingSeekableInputStream;
+import org.apache.parquet.schema.MessageType;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.TestTemplate;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.io.TempDir;
+
+@ExtendWith(ParameterizedTestExtension.class)
+public class TestMetricsRowGroupFilter {
+
+  @Parameters(name = "fileFormat = {0}")
+  public static List<Object> parameters() {
+    return Arrays.asList(FileFormat.PARQUET, FileFormat.ORC);
+  }
+
+  private static final Types.StructType STRUCT_FIELD_TYPE =
+      Types.StructType.of(Types.NestedField.required(8, "int_field", IntegerType.get()));
+
+  private static final Schema SCHEMA =
+      new Schema(
+          required(1, "id", IntegerType.get()),
+          optional(2, "no_stats_parquet", StringType.get()),
+          required(3, "required", StringType.get()),
+          optional(4, "all_nulls", DoubleType.get()),
+          optional(5, "some_nulls", StringType.get()),
+          optional(6, "no_nulls", StringType.get()),
+          optional(7, "struct_not_null", STRUCT_FIELD_TYPE),
+          optional(9, "not_in_file", FloatType.get()),
+          optional(10, "str", StringType.get()),
+          optional(
+              11,
+              "map_not_null",
+              Types.MapType.ofRequired(12, 13, StringType.get(), IntegerType.get())),
+          optional(14, "all_nans", DoubleType.get()),
+          optional(15, "some_nans", FloatType.get()),
+          optional(16, "no_nans", DoubleType.get()),
+          optional(17, "some_double_nans", DoubleType.get()));
+
+  private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
+      Types.StructType.of(Types.NestedField.required(8, "_int_field", IntegerType.get()));
+
+  private static final Schema FILE_SCHEMA =
+      new Schema(
+          required(1, "_id", IntegerType.get()),
+          optional(2, "_no_stats_parquet", StringType.get()),
+          required(3, "_required", StringType.get()),
+          optional(4, "_all_nulls", DoubleType.get()),
+          optional(5, "_some_nulls", StringType.get()),
+          optional(6, "_no_nulls", StringType.get()),
+          optional(7, "_struct_not_null", UNDERSCORE_STRUCT_FIELD_TYPE),
+          optional(10, "_str", StringType.get()),
+          optional(14, "_all_nans", Types.DoubleType.get()),
+          optional(15, "_some_nans", FloatType.get()),
+          optional(16, "_no_nans", Types.DoubleType.get()),
+          optional(17, "_some_double_nans", Types.DoubleType.get()));
+
+  private static final String TOO_LONG_FOR_STATS_PARQUET;
+
+  static {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < 200; i += 1) {
+      sb.append(UUID.randomUUID());
+    }
+    TOO_LONG_FOR_STATS_PARQUET = sb.toString();
+  }
+
+  private static final int INT_MIN_VALUE = 30;
+  private static final int INT_MAX_VALUE = 79;
+
+  private File orcFile = null;
+  private MessageType parquetSchema = null;
+  private BlockMetaData rowGroupMetadata = null;
+
+  @Parameter private FileFormat format;
+  @TempDir private File tempDir;
+
+  @BeforeEach
+  public void createInputFile() throws IOException {
+    switch (format) {
+      case ORC:
+        createOrcInputFile();
+        break;
+      case PARQUET:
+        createParquetInputFile();
+        break;
+      default:
+        throw new UnsupportedOperationException(
+            "Row group filter tests not supported for " + format);
+    }
+  }
+
+  public void createOrcInputFile() throws IOException {
+    this.orcFile = new File(tempDir, "junit" + System.nanoTime());
+
+    OutputFile outFile = Files.localOutput(orcFile);
+    try (FileAppender<GenericRecord> appender =
+        ORC.write(outFile)
+            .schema(FILE_SCHEMA)
+            .createWriterFunc(GenericOrcWriter::buildWriter)
+            .build()) {
+      GenericRecord record = GenericRecord.create(FILE_SCHEMA);
+      // create 50 records
+      for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
+        record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
+        record.setField(
+            "_no_stats_parquet",
+            TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
+        // in Parquet, but will produce stats for ORC
+        record.setField("_required", "req"); // required, always non-null
+        record.setField("_all_nulls", null); // never non-null
+        record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
+        record.setField("_no_nulls", ""); // optional, but always non-null
+        record.setField("_str", i + "str" + i);
+        record.setField("_all_nans", Double.NaN); // never non-nan
+        record.setField("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values
+        record.setField(
+            "_some_double_nans", (i % 10 == 0) ? Double.NaN : 2D); // includes some nan values
+        record.setField("_no_nans", 3D); // optional, but always non-nan
+
+        GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
+        structNotNull.setField("_int_field", INT_MIN_VALUE + i);
+        record.setField("_struct_not_null", structNotNull); // struct with int
+
+        appender.add(record);
+      }
+    }
+
+    InputFile inFile = Files.localInput(orcFile);
+    try (Reader reader =
+        OrcFile.createReader(
+            new Path(inFile.location()), OrcFile.readerOptions(new Configuration()))) {
+      assertThat(reader.getStripes()).as("Should create only one stripe").hasSize(1);
+    }
+
+    orcFile.deleteOnExit();
+  }
+
+  private void createParquetInputFile() throws IOException {
+    File parquetFile = new File(tempDir, "junit" + System.nanoTime());
+
+    // build struct field schema
+    org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(UNDERSCORE_STRUCT_FIELD_TYPE);
+
+    OutputFile outFile = Files.localOutput(parquetFile);
+    try (FileAppender<Record> appender = Parquet.write(outFile).schema(FILE_SCHEMA).build()) {
+      GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
+      // create 50 records
+      for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
+        builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
+        builder.set(
+            "_no_stats_parquet",
+            TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
+        // in Parquet
+        builder.set("_required", "req"); // required, always non-null
+        builder.set("_all_nulls", null); // never non-null
+        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
+        builder.set("_no_nulls", ""); // optional, but always non-null
+        builder.set("_all_nans", Double.NaN); // never non-nan
+        builder.set("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values
+        builder.set(
+            "_some_double_nans", (i % 10 == 0) ? Double.NaN : 2D); // includes some nan values
+        builder.set("_no_nans", 3D); // optional, but always non-nan
+        builder.set("_str", i + "str" + i);
+
+        Record structNotNull = new Record(structSchema);
+        structNotNull.put("_int_field", INT_MIN_VALUE + i);
+        builder.set("_struct_not_null", structNotNull); // struct with int
+
+        appender.add(builder.build());
+      }
+    }
+
+    InputFile inFile = Files.localInput(parquetFile);
+    try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
+      assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1);
+      rowGroupMetadata = reader.getRowGroups().get(0);
+      parquetSchema = reader.getFileMetaData().getSchema();
+    }
+
+    parquetFile.deleteOnExit();
+  }
+
+  @TestTemplate
+  public void testAllNulls() {
+    boolean shouldRead;
+
+    shouldRead = shouldRead(notNull("all_nulls"));
+    assertThat(shouldRead).as("Should skip: no non-null value in all null column").isFalse();
+
+    shouldRead = shouldRead(notNull("some_nulls"));
+    assertThat(shouldRead)
+        .as("Should read: column with some nulls contains a non-null value")
+        .isTrue();
+
+    shouldRead = shouldRead(notNull("no_nulls"));
+    assertThat(shouldRead).as("Should read: non-null column contains a non-null value").isTrue();
+
+    shouldRead = shouldRead(notNull("map_not_null"));
+    assertThat(shouldRead).as("Should read: map type is not skipped").isTrue();
+
+    shouldRead = shouldRead(notNull("struct_not_null"));
+    assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue();
+  }
+
+  @TestTemplate
+  public void testNoNulls() {
+    boolean shouldRead = shouldRead(isNull("all_nulls"));
+    assertThat(shouldRead).as("Should read: at least one null value in all null column").isTrue();
+
+    shouldRead = shouldRead(isNull("some_nulls"));
+    assertThat(shouldRead).as("Should read: column with some nulls contains a null value").isTrue();
+
+    shouldRead = shouldRead(isNull("no_nulls"));
+    assertThat(shouldRead).as("Should skip: non-null column contains no null values").isFalse();
+
+    shouldRead = shouldRead(isNull("map_not_null"));
+    assertThat(shouldRead).as("Should read: map type is not skipped").isTrue();
+
+    shouldRead = shouldRead(isNull("struct_not_null"));
+    assertThat(shouldRead).as("Should read: struct type is not skipped").isTrue();
+  }
+
+  @TestTemplate
+  public void testFloatWithNan() {
+    // NaN's should break Parquet's Min/Max stats, so we should be reading in all cases
+    boolean shouldRead = shouldRead(greaterThan("some_nans", 1.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(greaterThanOrEqual("some_nans", 1.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(lessThan("some_nans", 3.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(lessThanOrEqual("some_nans", 1.0));
+    assertThat(shouldRead).isTrue();
+
+    shouldRead = shouldRead(equal("some_nans", 2.0));
+    assertThat(shouldRead).isTrue();
+  }
+
+  @TestTemplate
+  public void testDoubleWithNan() {
+    boolean shouldRead = shouldRead(greaterThan("some_double_nans", 1.0));
+    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
+
+    shouldRead = shouldRead(greaterThanOrEqual("some_double_nans", 1.0));
+    assertThat(shouldRead)
+        .as("Should read: column with some nans contains the target value")
+        .isTrue();
+
+    shouldRead = shouldRead(lessThan("some_double_nans", 3.0));
+    assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue();
+
+    shouldRead =
shouldRead(lessThanOrEqual("some_double_nans", 1.0)); + assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue(); + + shouldRead = shouldRead(equal("some_double_nans", 2.0)); + assertThat(shouldRead).as("Should read: column with some nans contains target value").isTrue(); + } + + @TestTemplate + public void testIsNaN() { + boolean shouldRead = shouldRead(isNaN("all_nans")); + assertThat(shouldRead) + .as("Should read: NaN counts are not tracked in Parquet metrics") + .isTrue(); + + shouldRead = shouldRead(isNaN("some_nans")); + assertThat(shouldRead) + .as("Should read: NaN counts are not tracked in Parquet metrics") + .isTrue(); + + shouldRead = shouldRead(isNaN("no_nans")); + switch (format) { + case ORC: + assertThat(shouldRead) + .as("Should read 0 rows due to the ORC filter push-down feature") + .isFalse(); + break; + case PARQUET: + assertThat(shouldRead) + .as("Should read: NaN counts are not tracked in Parquet metrics") + .isTrue(); + break; + default: + throw new UnsupportedOperationException( + "Row group filter tests not supported for " + format); + } + + shouldRead = shouldRead(isNaN("all_nulls")); + assertThat(shouldRead).as("Should skip: all null column will not contain nan value").isFalse(); + } + + @TestTemplate + public void testNotNaN() { + boolean shouldRead = shouldRead(notNaN("all_nans")); + assertThat(shouldRead) + .as("Should read: NaN counts are not tracked in Parquet metrics") + .isTrue(); + + shouldRead = shouldRead(notNaN("some_nans")); + assertThat(shouldRead) + .as("Should read: NaN counts are not tracked in Parquet metrics") + .isTrue(); + + shouldRead = shouldRead(notNaN("no_nans")); + assertThat(shouldRead) + .as("Should read: NaN counts are not tracked in Parquet metrics") + .isTrue(); + + shouldRead = shouldRead(notNaN("all_nulls")); + assertThat(shouldRead) + .as("Should read: NaN counts are not tracked in Parquet metrics") + .isTrue(); + } + + @TestTemplate + public void testRequiredColumn() { + boolean shouldRead = shouldRead(notNull("required")); + assertThat(shouldRead).as("Should read: required columns are always non-null").isTrue(); + + shouldRead = shouldRead(isNull("required")); + assertThat(shouldRead).as("Should skip: required columns are always non-null").isFalse(); + } + + @TestTemplate + public void testMissingColumn() { + assertThatThrownBy(() -> shouldRead(lessThan("missing", 5))) + .as("Should complain about missing column in expression") + .isInstanceOf(ValidationException.class) + .hasMessageStartingWith("Cannot find field 'missing'"); + } + + @TestTemplate + public void testColumnNotInFile() { + assumeThat(format) + .as( + "If a column is not in file, ORC does NOT try to apply predicates assuming null values for the column") + .isNotEqualTo(FileFormat.ORC); + + Expression[] cannotMatch = + new Expression[] { + lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), + equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), + greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file") + }; + + for (Expression expr : cannotMatch) { + boolean shouldRead = shouldRead(expr); + assertThat(shouldRead) + .as("Should skip when column is not in file (all nulls): " + expr) + .isFalse(); + } + + Expression[] canMatch = new Expression[] {isNull("not_in_file"), notEqual("not_in_file", 1.0f)}; + + for (Expression expr : canMatch) { + boolean shouldRead = shouldRead(expr); + assertThat(shouldRead) + .as("Should read when column is not in file (all nulls): " + expr) + .isTrue(); + } + } + + 
@TestTemplate + public void testMissingStatsParquet() { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + Expression[] exprs = + new Expression[] { + lessThan("no_stats_parquet", "a"), + lessThanOrEqual("no_stats_parquet", "b"), + equal("no_stats_parquet", "c"), + greaterThan("no_stats_parquet", "d"), + greaterThanOrEqual("no_stats_parquet", "e"), + notEqual("no_stats_parquet", "f"), + isNull("no_stats_parquet"), + notNull("no_stats_parquet"), + startsWith("no_stats_parquet", "a"), + notStartsWith("no_stats_parquet", "a") + }; + + for (Expression expr : exprs) { + boolean shouldRead = shouldRead(expr); + assertThat(shouldRead).as("Should read when missing stats for expr: " + expr).isTrue(); + } + } + + @TestTemplate + public void testZeroRecordFileParquet() { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + BlockMetaData emptyBlock = new BlockMetaData(); + emptyBlock.setRowCount(0); + + Expression[] exprs = + new Expression[] { + lessThan("id", 5), + lessThanOrEqual("id", 30), + equal("id", 70), + greaterThan("id", 78), + greaterThanOrEqual("id", 90), + notEqual("id", 101), + isNull("some_nulls"), + notNull("some_nulls") + }; + + for (Expression expr : exprs) { + boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock); + assertThat(shouldRead).as("Should never read 0-record file: " + expr).isFalse(); + } + } + + @TestTemplate + public void testNot() { + // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out + boolean shouldRead = shouldRead(not(lessThan("id", INT_MIN_VALUE - 25))); + assertThat(shouldRead).as("Should read: not(false)").isTrue(); + + shouldRead = shouldRead(not(greaterThan("id", INT_MIN_VALUE - 25))); + assertThat(shouldRead).as("Should skip: not(true)").isFalse(); + } + + @TestTemplate + public void testAnd() { + // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out + boolean shouldRead = + shouldRead( + and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MIN_VALUE - 30))); + assertThat(shouldRead).as("Should skip: and(false, true)").isFalse(); + + shouldRead = + shouldRead( + and(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))); + assertThat(shouldRead).as("Should skip: and(false, false)").isFalse(); + + shouldRead = + shouldRead( + and(greaterThan("id", INT_MIN_VALUE - 25), lessThanOrEqual("id", INT_MIN_VALUE))); + assertThat(shouldRead).as("Should read: and(true, true)").isTrue(); + } + + @TestTemplate + public void testOr() { + // this test case must use a real predicate, not alwaysTrue(), or binding will simplify it out + boolean shouldRead = + shouldRead( + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE + 1))); + assertThat(shouldRead).as("Should skip: or(false, false)").isFalse(); + + shouldRead = + shouldRead( + or(lessThan("id", INT_MIN_VALUE - 25), greaterThanOrEqual("id", INT_MAX_VALUE - 19))); + assertThat(shouldRead).as("Should read: or(false, true)").isTrue(); + } + + @TestTemplate + public void testIntegerLt() { + boolean shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse(); + + shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE)); + assertThat(shouldRead) + .as("Should not read: id range below lower bound (30 is not < 30)") + .isFalse(); + + shouldRead = shouldRead(lessThan("id", INT_MIN_VALUE + 1)); + assertThat(shouldRead).as("Should read: one possible 
id").isTrue(); + + shouldRead = shouldRead(lessThan("id", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: may possible ids").isTrue(); + } + + @TestTemplate + public void testIntegerLtEq() { + boolean shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse(); + + shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse(); + + shouldRead = shouldRead(lessThanOrEqual("id", INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: one possible id").isTrue(); + + shouldRead = shouldRead(lessThanOrEqual("id", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: many possible ids").isTrue(); + } + + @TestTemplate + public void testIntegerGt() { + boolean shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 79)").isFalse(); + + shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE)); + assertThat(shouldRead) + .as("Should not read: id range above upper bound (79 is not > 79)") + .isFalse(); + + shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 1)); + assertThat(shouldRead).as("Should read: one possible id").isTrue(); + + shouldRead = shouldRead(greaterThan("id", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: may possible ids").isTrue(); + } + + @TestTemplate + public void testIntegerGtEq() { + boolean shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 79)").isFalse(); + + shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse(); + + shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: one possible id").isTrue(); + + shouldRead = shouldRead(greaterThanOrEqual("id", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: may possible ids").isTrue(); + } + + @TestTemplate + public void testIntegerEq() { + boolean shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); + + shouldRead = shouldRead(equal("id", INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); + + shouldRead = shouldRead(equal("id", INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); + + shouldRead = shouldRead(equal("id", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); + + shouldRead = shouldRead(equal("id", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); + + shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); + + shouldRead = shouldRead(equal("id", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); + } + + @TestTemplate + public void testIntegerNotEq() { + boolean shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should read: id below lower 
bound").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); + } + + @TestTemplate + public void testIntegerNotEqRewritten() { + boolean shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 25))); + assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); + + shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE - 1))); + assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); + + shouldRead = shouldRead(not(equal("id", INT_MIN_VALUE))); + assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); + + shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE - 4))); + assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); + + shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE))); + assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); + + shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 1))); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); + + shouldRead = shouldRead(not(equal("id", INT_MAX_VALUE + 6))); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); + } + + @TestTemplate + public void testStructFieldLt() { + boolean shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse(); + + shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE)); + assertThat(shouldRead) + .as("Should not read: id range below lower bound (30 is not < 30)") + .isFalse(); + + shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MIN_VALUE + 1)); + assertThat(shouldRead).as("Should read: one possible id").isTrue(); + + shouldRead = shouldRead(lessThan("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: may possible ids").isTrue(); + } + + @TestTemplate + public void testStructFieldLtEq() { + boolean shouldRead = + shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should not read: id range below lower bound (5 < 30)").isFalse(); + + shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should not read: id range below lower bound (29 < 30)").isFalse(); + + shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: one possible id").isTrue(); + + shouldRead = shouldRead(lessThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: many possible ids").isTrue(); + } + + @TestTemplate + public void testStructFieldGt() { + boolean shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 
79)").isFalse(); + + shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead) + .as("Should not read: id range above upper bound (79 is not > 79)") + .isFalse(); + + shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 1)); + assertThat(shouldRead).as("Should read: one possible id").isTrue(); + + shouldRead = shouldRead(greaterThan("struct_not_null.int_field", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: may possible ids").isTrue(); + } + + @TestTemplate + public void testStructFieldGtEq() { + boolean shouldRead = + shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should not read: id range above upper bound (85 < 79)").isFalse(); + + shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should not read: id range above upper bound (80 > 79)").isFalse(); + + shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: one possible id").isTrue(); + + shouldRead = shouldRead(greaterThanOrEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: may possible ids").isTrue(); + } + + @TestTemplate + public void testStructFieldEq() { + boolean shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); + + shouldRead = shouldRead(equal("struct_not_null.int_field", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should not read: id above upper bound").isFalse(); + } + + @TestTemplate + public void testStructFieldNotEq() { + boolean shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 25)); + assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should read: id below lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE - 4)); + assertThat(shouldRead).as("Should read: id between lower and upper bounds").isTrue(); + + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE)); + assertThat(shouldRead).as("Should read: id equal to upper bound").isTrue(); + + shouldRead = shouldRead(notEqual("id", INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); 
+ + shouldRead = shouldRead(notEqual("struct_not_null.int_field", INT_MAX_VALUE + 6)); + assertThat(shouldRead).as("Should read: id above upper bound").isTrue(); + } + + @TestTemplate + public void testCaseInsensitive() { + boolean shouldRead = shouldRead(equal("ID", INT_MIN_VALUE - 25), false); + assertThat(shouldRead).as("Should not read: id below lower bound").isFalse(); + } + + @TestTemplate + public void testStringStartsWith() { + assumeThat(format) + .as("ORC row group filter does not support StringStartsWith") + .isNotEqualTo(FileFormat.ORC); + + boolean shouldRead = shouldRead(startsWith("str", "1")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(startsWith("str", "0st")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(startsWith("str", "1str1")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(startsWith("str", "1str1_xgd")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(startsWith("str", "2str")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(startsWith("str", "9xstr")); + assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); + + shouldRead = shouldRead(startsWith("str", "0S")); + assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); + + shouldRead = shouldRead(startsWith("str", "x")); + assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); + + shouldRead = shouldRead(startsWith("str", "9str9aaa")); + assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); + } + + @TestTemplate + public void testStringNotStartsWith() { + assumeThat(format) + .as("ORC row group filter does not support StringStartsWith") + .isNotEqualTo(FileFormat.ORC); + + boolean shouldRead = shouldRead(notStartsWith("str", "1")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("str", "0st")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("str", "1str1")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("str", "1str1_xgd")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("str", "2str")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("str", "9xstr")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("required", "r")); + assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); + + shouldRead = shouldRead(notStartsWith("required", "requ")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("some_nulls", "ssome")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + + shouldRead = shouldRead(notStartsWith("some_nulls", "som")); + assertThat(shouldRead).as("Should read: range matches").isTrue(); + } + + @TestTemplate + public void testIntegerIn() { + boolean shouldRead = shouldRead(in("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); + assertThat(shouldRead).as("Should not read: id below lower bound (5 < 30, 6 < 30)").isFalse(); + + shouldRead = shouldRead(in("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); + 
assertThat(shouldRead).as("Should not read: id below lower bound (28 < 30, 29 < 30)").isFalse(); + + shouldRead = shouldRead(in("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 30)").isTrue(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); + assertThat(shouldRead) + .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") + .isTrue(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); + assertThat(shouldRead).as("Should not read: id above upper bound (80 > 79, 81 > 79)").isFalse(); + + shouldRead = shouldRead(in("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); + assertThat(shouldRead).as("Should not read: id above upper bound (85 > 79, 86 > 79)").isFalse(); + + shouldRead = shouldRead(in("all_nulls", 1, 2)); + assertThat(shouldRead).as("Should skip: in on all nulls column").isFalse(); + + shouldRead = shouldRead(in("some_nulls", "aaa", "some")); + assertThat(shouldRead).as("Should read: in on some nulls column").isTrue(); + + shouldRead = shouldRead(in("no_nulls", "aaa", "")); + assertThat(shouldRead).as("Should read: in on no nulls column").isTrue(); + } + + @TestTemplate + public void testIntegerNotIn() { + boolean shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 25, INT_MIN_VALUE - 24)); + assertThat(shouldRead).as("Should read: id below lower bound (5 < 30, 6 < 30)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 2, INT_MIN_VALUE - 1)); + assertThat(shouldRead).as("Should read: id below lower bound (28 < 30, 29 < 30)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MIN_VALUE - 1, INT_MIN_VALUE)); + assertThat(shouldRead).as("Should read: id equal to lower bound (30 == 30)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE - 4, INT_MAX_VALUE - 3)); + assertThat(shouldRead) + .as("Should read: id between lower and upper bounds (30 < 75 < 79, 30 < 76 < 79)") + .isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE, INT_MAX_VALUE + 1)); + assertThat(shouldRead).as("Should read: id equal to upper bound (79 == 79)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 1, INT_MAX_VALUE + 2)); + assertThat(shouldRead).as("Should read: id above upper bound (80 > 79, 81 > 79)").isTrue(); + + shouldRead = shouldRead(notIn("id", INT_MAX_VALUE + 6, INT_MAX_VALUE + 7)); + assertThat(shouldRead).as("Should read: id above upper bound (85 > 79, 86 > 79)").isTrue(); + + shouldRead = shouldRead(notIn("all_nulls", 1, 2)); + assertThat(shouldRead).as("Should read: notIn on all nulls column").isTrue(); + + shouldRead = shouldRead(notIn("some_nulls", "aaa", "some")); + assertThat(shouldRead).as("Should read: notIn on some nulls column").isTrue(); + + shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); + if (format == FileFormat.PARQUET) { + // no_nulls column has all values == "", so notIn("no_nulls", "") should always be false and + // so should be skipped + // However, the metrics evaluator in Parquets always reads row group for a notIn filter + assertThat(shouldRead).as("Should read: notIn on no nulls column").isTrue(); + } else { + assertThat(shouldRead).as("Should skip: notIn on no nulls column").isFalse(); + } + } + + @TestTemplate + public void testSomeNullsNotEq() { + boolean shouldRead = shouldRead(notEqual("some_nulls", 
"some")); + assertThat(shouldRead).as("Should read: notEqual on some nulls column").isTrue(); + } + + @TestTemplate + public void testInLimitParquet() { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + boolean shouldRead = shouldRead(in("id", 1, 2)); + assertThat(shouldRead).as("Should not read if IN is evaluated").isFalse(); + + List ids = Lists.newArrayListWithExpectedSize(400); + for (int id = -400; id <= 0; id++) { + ids.add(id); + } + + shouldRead = shouldRead(in("id", ids)); + assertThat(shouldRead).as("Should read if IN is not evaluated").isTrue(); + } + + @TestTemplate + public void testParquetTypePromotion() { + assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET); + + Schema promotedSchema = new Schema(required(1, "id", Types.LongType.get())); + boolean shouldRead = + new ParquetMetricsRowGroupFilter(promotedSchema, equal("id", INT_MIN_VALUE + 1), true) + .shouldRead(parquetSchema, rowGroupMetadata); + assertThat(shouldRead).as("Should succeed with promoted schema").isTrue(); + } + + @TestTemplate + public void testTransformFilter() { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + boolean shouldRead = + new ParquetMetricsRowGroupFilter(SCHEMA, equal(truncate("required", 2), "some_value"), true) + .shouldRead(parquetSchema, rowGroupMetadata); + assertThat(shouldRead) + .as("Should read: filter contains non-reference evaluate as True") + .isTrue(); + } + + @TestTemplate + public void testVariantFilterNotNull() throws IOException { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + Schema variantSchema = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "variant_field", Types.VariantType.get())); + + File parquetFile = new File(tempDir, "test-variant" + System.nanoTime()); + + OutputFile outFile = Files.localOutput(parquetFile); + try (FileAppender appender = + Parquet.write(outFile) + .schema(variantSchema) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + + for (int i = 0; i < 10; i++) { + GenericRecord record = GenericRecord.create(variantSchema); + record.setField("id", i); + + if (i % 2 == 0) { + VariantMetadata metadata = Variants.metadata("field"); + ShreddedObject obj = Variants.object(metadata); + obj.put("field", Variants.of("value" + i)); + Variant variant = Variant.of(metadata, obj); + record.setField("variant_field", variant); + } + + appender.add(record); + } + } + + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + ParquetMetricsRowGroupFilter rowGroupFilter = + new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); + boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan") + .isTrue(); + } + parquetFile.deleteOnExit(); + } + + @TestTemplate + public void testAllNullsVariantNotNull() throws IOException { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + Schema variantSchema = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "variant_field", Types.VariantType.get())); + + File parquetFile = new File(tempDir, "test-variant-nulls" + System.nanoTime()); + + OutputFile outFile = Files.localOutput(parquetFile); + try (FileAppender 
appender = + Parquet.write(outFile) + .schema(variantSchema) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + + for (int i = 0; i < 10; i++) { + GenericRecord record = GenericRecord.create(variantSchema); + record.setField("id", i); + record.setField("variant_field", null); + appender.add(record); + } + } + + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + ParquetMetricsRowGroupFilter rowGroupFilter = + new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); + boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") + .isTrue(); + } + parquetFile.deleteOnExit(); + } + + private boolean shouldRead(Expression expression) { + return shouldRead(expression, true); + } + + private boolean shouldRead(Expression expression, boolean caseSensitive) { + switch (format) { + case ORC: + return shouldReadOrc(expression, caseSensitive); + case PARQUET: + return shouldReadParquet(expression, caseSensitive, parquetSchema, rowGroupMetadata); + default: + throw new UnsupportedOperationException( + "Row group filter tests not supported for " + format); + } + } + + private boolean shouldReadOrc(Expression expression, boolean caseSensitive) { + try (CloseableIterable reader = + ORC.read(Files.localInput(orcFile)) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .filter(expression) + .caseSensitive(caseSensitive) + .build()) { + return !Lists.newArrayList(reader).isEmpty(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private boolean shouldReadParquet( + Expression expression, + boolean caseSensitive, + MessageType messageType, + BlockMetaData blockMetaData) { + return new ParquetMetricsRowGroupFilter(SCHEMA, expression, caseSensitive) + .shouldRead(messageType, blockMetaData); + } + + private org.apache.parquet.io.InputFile parquetInputFile(InputFile inFile) { + return new org.apache.parquet.io.InputFile() { + @Override + public long getLength() throws IOException { + return inFile.getLength(); + } + + @Override + public org.apache.parquet.io.SeekableInputStream newStream() throws IOException { + SeekableInputStream stream = inFile.newStream(); + return new DelegatingSeekableInputStream(stream) { + @Override + public long getPos() throws IOException { + return stream.getPos(); + } + + @Override + public void seek(long newPos) throws IOException { + stream.seek(newPos); + } + }; + } + }; + } +} diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java deleted file mode 100644 index 5a2ff2fb6727..000000000000 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestOrcMetricsRowGroupFilter.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.data.orc; - -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; -import java.io.IOException; -import java.io.UncheckedIOException; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.Files; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.MetricsRowGroupFilterTestBase; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.orc.OrcFile; -import org.apache.orc.Reader; -import org.junit.jupiter.api.Test; - -public class TestOrcMetricsRowGroupFilter extends MetricsRowGroupFilterTestBase { - - private File orcFile = null; - - @Override - protected void createInputFile() throws IOException { - this.orcFile = new File(tempDir, "junit" + System.nanoTime()); - - OutputFile outFile = Files.localOutput(orcFile); - try (FileAppender appender = - ORC.write(outFile) - .schema(BASE_FILE_SCHEMA) - .createWriterFunc(GenericOrcWriter::buildWriter) - .build()) { - GenericRecord record = GenericRecord.create(BASE_FILE_SCHEMA); - for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { - populateBaseFields(record, i); - appender.add(record); - } - } - InputFile inFile = Files.localInput(orcFile); - try (Reader reader = - OrcFile.createReader( - new Path(inFile.location()), OrcFile.readerOptions(new Configuration()))) { - assertThat(reader.getStripes()).as("Should create only one stripe").hasSize(1); - } - orcFile.deleteOnExit(); - } - - @Override - protected boolean shouldRead(Expression expression, boolean caseSensitive) { - try (CloseableIterable reader = - ORC.read(Files.localInput(orcFile)) - .project(BASE_SCHEMA) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(BASE_SCHEMA, fileSchema)) - .filter(expression) - .caseSensitive(caseSensitive) - .build()) { - return !Lists.newArrayList(reader).isEmpty(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - @Test - public void testIsNaNNoNans() { - boolean shouldRead = shouldRead(isNaN("no_nans")); - assertThat(shouldRead).as("ORC filter push-down can skip groups with no NaN").isFalse(); - } - - @Test - public void testNotInNoNulls() { - boolean shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); - assertThat(shouldRead).as("ORC can skip when all values match excluded set").isFalse(); - } -} diff --git a/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java deleted file mode 100644 index 94d098f10a25..000000000000 --- 
a/data/src/test/java/org/apache/iceberg/data/parquet/TestParquetMetricsRowGroupFilter.java +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.data.parquet; - -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.notStartsWith; -import static org.apache.iceberg.expressions.Expressions.startsWith; -import static org.apache.iceberg.expressions.Expressions.truncate; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.UUID; -import org.apache.iceberg.Files; -import org.apache.iceberg.Schema; -import org.apache.iceberg.data.GenericRecord; -import org.apache.iceberg.data.MetricsRowGroupFilterTestBase; -import org.apache.iceberg.exceptions.ValidationException; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.io.OutputFile; -import org.apache.iceberg.io.SeekableInputStream; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetMetricsRowGroupFilter; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.apache.iceberg.types.Types.StringType; -import org.apache.iceberg.variants.ShreddedObject; -import org.apache.iceberg.variants.Variant; -import org.apache.iceberg.variants.VariantMetadata; -import org.apache.iceberg.variants.Variants; -import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.io.DelegatingSeekableInputStream; -import org.apache.parquet.schema.MessageType; -import org.junit.jupiter.api.Test; - -public class TestParquetMetricsRowGroupFilter 
extends MetricsRowGroupFilterTestBase { - - private static final Schema PARQUET_SCHEMA = - new Schema( - Lists.newArrayList( - Iterables.concat( - BASE_SCHEMA.columns(), - Lists.newArrayList( - optional(17, "no_stats_parquet", StringType.get()), - optional(18, "some_nulls_variant", Types.VariantType.get()), - optional(19, "all_nulls_variant", Types.VariantType.get()))))); - - private static final Schema PARQUET_FILE_SCHEMA = - new Schema( - Lists.newArrayList( - Iterables.concat( - BASE_FILE_SCHEMA.columns(), - Lists.newArrayList( - optional(17, "_no_stats_parquet", StringType.get()), - optional(18, "_some_nulls_variant", Types.VariantType.get()), - optional(19, "_all_nulls_variant", Types.VariantType.get()))))); - - private MessageType parquetSchema = null; - private BlockMetaData rowGroupMetadata = null; - - @Override - protected void createInputFile() throws IOException { - File parquetFile = new File(tempDir, "junit" + System.nanoTime()); - - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < 200; i += 1) { - sb.append(UUID.randomUUID()); - } - String tooLongForParquetValue = sb.toString(); - - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = - Parquet.write(outFile) - .schema(PARQUET_FILE_SCHEMA) - .createWriterFunc(GenericParquetWriter::create) - .build()) { - for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { - GenericRecord record = GenericRecord.create(PARQUET_FILE_SCHEMA); - populateBaseFields(record, i); - record.setField( - "_no_stats_parquet", - tooLongForParquetValue); // value longer than 4k will produce no stats - if (i % 2 == 0) { - VariantMetadata metadata = Variants.metadata("data"); - ShreddedObject obj = Variants.object(metadata); - obj.put("data", Variants.of(25 + i)); - record.setField("_some_nulls_variant", Variant.of(metadata, obj)); - } else { - record.setField("_some_nulls_variant", null); - } - record.setField("_all_nulls_variant", null); - appender.add(record); - } - } - - InputFile inFile = Files.localInput(parquetFile); - try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { - assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); - rowGroupMetadata = reader.getRowGroups().get(0); - parquetSchema = reader.getFileMetaData().getSchema(); - } - parquetFile.deleteOnExit(); - } - - @Override - protected boolean shouldRead(Expression expression, boolean caseSensitive) { - return shouldRead(expression, caseSensitive, rowGroupMetadata); - } - - @Test - public void testIsNaNNoNans() { - boolean shouldRead = shouldRead(isNaN("no_nans")); - assertThat(shouldRead).as("Parquet metrics don't track NaN counts").isTrue(); - } - - @Test - public void testNotInNoNulls() { - boolean shouldRead = shouldRead(notIn("no_nulls", "aaa", "")); - assertThat(shouldRead).as("Parquet metrics evaluator reads for notIn").isTrue(); - } - - @Test - public void testColumnNotInFile() { - Expression[] cannotMatch = - new Expression[] { - lessThan("not_in_file", 1.0f), lessThanOrEqual("not_in_file", 1.0f), - equal("not_in_file", 1.0f), greaterThan("not_in_file", 1.0f), - greaterThanOrEqual("not_in_file", 1.0f), notNull("not_in_file") - }; - - for (Expression expr : cannotMatch) { - boolean shouldRead = shouldRead(expr); - assertThat(shouldRead) - .as("Should skip when column is not in file (all nulls): " + expr) - .isFalse(); - } - - Expression[] canMatch = new Expression[] {isNull("not_in_file"), notEqual("not_in_file", 1.0f)}; - - for (Expression expr : canMatch) { - boolean 
shouldRead = shouldRead(expr); - assertThat(shouldRead) - .as("Should read when column is not in file (all nulls): " + expr) - .isTrue(); - } - } - - @Test - public void testMissingStatsParquet() { - Expression[] exprs = - new Expression[] { - lessThan("no_stats_parquet", "a"), - lessThanOrEqual("no_stats_parquet", "b"), - equal("no_stats_parquet", "c"), - greaterThan("no_stats_parquet", "d"), - greaterThanOrEqual("no_stats_parquet", "e"), - notEqual("no_stats_parquet", "f"), - isNull("no_stats_parquet"), - notNull("no_stats_parquet"), - startsWith("no_stats_parquet", "a"), - notStartsWith("no_stats_parquet", "a") - }; - - for (Expression expr : exprs) { - boolean shouldRead = shouldRead(expr); - assertThat(shouldRead).as("Should read when missing stats for expr: " + expr).isTrue(); - } - } - - @Test - public void testZeroRecordFileParquet() { - BlockMetaData emptyBlock = new BlockMetaData(); - emptyBlock.setRowCount(0); - - Expression[] exprs = - new Expression[] { - lessThan("id", 5), - lessThanOrEqual("id", 30), - equal("id", 70), - greaterThan("id", 78), - greaterThanOrEqual("id", 90), - notEqual("id", 101), - isNull("some_nulls"), - notNull("some_nulls") - }; - - for (Expression expr : exprs) { - boolean shouldRead = shouldRead(expr, true, emptyBlock); - assertThat(shouldRead).as("Should never read 0-record file: " + expr).isFalse(); - } - } - - @Test - public void testStringStartsWith() { - boolean shouldRead = shouldRead(startsWith("str", "1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "0st")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "1str1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "1str1_xgd")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "2str")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(startsWith("str", "9xstr")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - - shouldRead = shouldRead(startsWith("str", "0S")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - - shouldRead = shouldRead(startsWith("str", "x")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - - shouldRead = shouldRead(startsWith("str", "9str9aaa")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - } - - @Test - public void testStringNotStartsWith() { - boolean shouldRead = shouldRead(notStartsWith("str", "1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "0st")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "1str1")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "1str1_xgd")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "2str")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("str", "9xstr")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("required", "r")); - assertThat(shouldRead).as("Should not read: range doesn't match").isFalse(); - 
- shouldRead = shouldRead(notStartsWith("required", "requ")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("some_nulls", "ssome")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - - shouldRead = shouldRead(notStartsWith("some_nulls", "som")); - assertThat(shouldRead).as("Should read: range matches").isTrue(); - } - - @Test - public void testInLimitParquet() { - boolean shouldRead = shouldRead(in("id", 1, 2)); - assertThat(shouldRead).as("Should not read if IN is evaluated").isFalse(); - - List ids = Lists.newArrayListWithExpectedSize(400); - for (int id = -400; id <= 0; id++) { - ids.add(id); - } - - shouldRead = shouldRead(in("id", ids)); - assertThat(shouldRead).as("Should read if IN is not evaluated").isTrue(); - } - - @Test - public void testParquetTypePromotion() { - Schema promotedSchema = new Schema(Types.NestedField.required(1, "id", Types.LongType.get())); - boolean shouldRead = - new ParquetMetricsRowGroupFilter(promotedSchema, equal("id", INT_MIN_VALUE + 1), true) - .shouldRead(parquetSchema, rowGroupMetadata); - assertThat(shouldRead).as("Should succeed with promoted schema").isTrue(); - } - - @Test - public void testTransformFilter() { - boolean shouldRead = - new ParquetMetricsRowGroupFilter( - PARQUET_SCHEMA, equal(truncate("required", 2), "some_value"), true) - .shouldRead(parquetSchema, rowGroupMetadata); - assertThat(shouldRead) - .as("Should read: filter contains non-reference evaluate as True") - .isTrue(); - } - - @Test - public void testVariantNotNull() { - boolean shouldRead = shouldRead(notNull("some_nulls_variant")); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan") - .isTrue(); - - shouldRead = shouldRead(notNull("all_nulls_variant")); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") - .isTrue(); - } - - @Test - public void testVariantEq() { - assertThatThrownBy(() -> shouldRead(equal("some_nulls_variant", "test"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test"); - } - - @Test - public void testVariantIn() { - assertThatThrownBy(() -> shouldRead(in("some_nulls_variant", "test1", "test2"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - - assertThatThrownBy(() -> shouldRead(in("all_nulls_variant", "test1", "test2"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - } - - @Test - public void testVariantNotIn() { - assertThatThrownBy(() -> shouldRead(notIn("some_nulls_variant", "test1", "test2"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - - assertThatThrownBy(() -> shouldRead(notIn("all_nulls_variant", "test1", "test2"))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("test1"); - } - - @Test - public void testVariantIsNull() { - boolean shouldRead = shouldRead(isNull("some_nulls_variant")); - assertThat(shouldRead) - .as("Should read: variant isNull filters must be evaluated post scan") - .isTrue(); - - shouldRead = shouldRead(isNull("all_nulls_variant")); - 
assertThat(shouldRead) - .as("Should read: variant isNull filters must be evaluated post scan even for all nulls") - .isTrue(); - } - - @Test - public void testVariantComparisons() { - assertThatThrownBy(() -> shouldRead(lessThan("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(lessThanOrEqual("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(greaterThan("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(greaterThanOrEqual("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - - assertThatThrownBy(() -> shouldRead(notEqual("some_nulls_variant", 42))) - .isInstanceOf(ValidationException.class) - .hasMessageContaining("Invalid value for conversion to type variant:") - .hasMessageContaining("42"); - } - - private boolean shouldRead(Expression expression, boolean caseSensitive, BlockMetaData block) { - if (parquetSchema == null || block == null) { - throw new IllegalStateException( - "Parquet file not created yet. Call createInputFile() first."); - } - return new ParquetMetricsRowGroupFilter(PARQUET_SCHEMA, expression, caseSensitive) - .shouldRead(parquetSchema, block); - } - - private org.apache.parquet.io.InputFile parquetInputFile(InputFile inFile) { - return new org.apache.parquet.io.InputFile() { - @Override - public long getLength() throws IOException { - return inFile.getLength(); - } - - @Override - public org.apache.parquet.io.SeekableInputStream newStream() throws IOException { - SeekableInputStream stream = inFile.newStream(); - return new DelegatingSeekableInputStream(stream) { - @Override - public long getPos() throws IOException { - return stream.getPos(); - } - - @Override - public void seek(long newPos) throws IOException { - stream.seek(newPos); - } - }; - } - }; - } -} From c7c715e2c8139c68538055dfa3a57390f8770b93 Mon Sep 17 00:00:00 2001 From: geruh Date: Wed, 17 Sep 2025 18:14:36 -0700 Subject: [PATCH 5/9] Revert the test refactoring and clean up original --- .../data/TestMetricsRowGroupFilter.java | 144 +++++++++++++----- 1 file changed, 108 insertions(+), 36 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index a475fecf004b..54ab97dd4f9e 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -18,7 +18,6 @@ */ package org.apache.iceberg.data; -import static org.apache.iceberg.avro.AvroSchemaUtil.convert; import static org.apache.iceberg.expressions.Expressions.and; import static org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.greaterThan; @@ -49,8 +48,6 @@ import java.util.Arrays; import java.util.List; import java.util.UUID; -import org.apache.avro.generic.GenericData.Record; -import org.apache.avro.generic.GenericRecordBuilder; import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.iceberg.FileFormat; @@ -59,7 +56,6 @@ import org.apache.iceberg.ParameterizedTestExtension; import org.apache.iceberg.Parameters; import org.apache.iceberg.Schema; -import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.data.orc.GenericOrcReader; import org.apache.iceberg.data.orc.GenericOrcWriter; import org.apache.iceberg.data.parquet.GenericParquetWriter; @@ -143,6 +139,11 @@ public static List parameters() { optional(16, "_no_nans", Types.DoubleType.get()), optional(17, "_some_double_nans", Types.DoubleType.get())); + private static final Schema VARIANT_SCHEMA = + new Schema( + required(1, "id", IntegerType.get()), + optional(2, "variant_field", Types.VariantType.get())); + private static final String TOO_LONG_FOR_STATS_PARQUET; static { @@ -225,40 +226,32 @@ public void createOrcInputFile() throws IOException { } private void createParquetInputFile() throws IOException { - File parquetFile = new File(tempDir, "junit" + System.nanoTime()); - - // build struct field schema - org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(UNDERSCORE_STRUCT_FIELD_TYPE); - - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = Parquet.write(outFile).schema(FILE_SCHEMA).build()) { - GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); - // create 50 records - for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { - builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 - builder.set( - "_no_stats_parquet", - TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats - // in Parquet - builder.set("_required", "req"); // required, always non-null - builder.set("_all_nulls", null); // never non-null - builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values - builder.set("_no_nulls", ""); // optional, but always non-null - builder.set("_all_nans", Double.NaN); // never non-nan - builder.set("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values - builder.set( - "_some_double_nans", (i % 10 == 0) ? Double.NaN : 2D); // includes some nan values - builder.set("_no_nans", 3D); // optional, but always non-nan - builder.set("_str", i + "str" + i); - - Record structNotNull = new Record(structSchema); - structNotNull.put("_int_field", INT_MIN_VALUE + i); - builder.set("_struct_not_null", structNotNull); // struct with int - - appender.add(builder.build()); - } + List records = Lists.newArrayList(); + + for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) { + GenericRecord builder = GenericRecord.create(FILE_SCHEMA); + builder.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0 + builder.setField( + "_no_stats_parquet", + TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats + // in Parquet + builder.setField("_required", "req"); // required, always non-null + builder.setField("_all_nulls", null); // never non-null + builder.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values + builder.setField("_no_nulls", ""); // optional, but always non-null + builder.setField("_all_nans", Double.NaN); // never non-nan + builder.setField("_some_nans", (i % 10 == 0) ? Float.NaN : 2F); // includes some nan values + builder.setField( + "_some_double_nans", (i % 10 == 0) ? 
Double.NaN : 2D); // includes some nan values + builder.setField("_no_nans", 3D); // optional, but always non-nan + builder.setField("_str", i + "str" + i); + GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE); + structNotNull.setField("_int_field", INT_MIN_VALUE + i); + builder.setField("_struct_not_null", structNotNull); // struct with int + records.add(builder); } + File parquetFile = writeParquetFile("junit", FILE_SCHEMA, records); InputFile inFile = Files.localInput(parquetFile); try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); @@ -269,6 +262,24 @@ private void createParquetInputFile() throws IOException { parquetFile.deleteOnExit(); } + private File writeParquetFile(String fileName, Schema schema, List records) + throws IOException { + File parquetFile = new File(tempDir, fileName + System.nanoTime()); + + OutputFile outFile = Files.localOutput(parquetFile); + try (FileAppender appender = + Parquet.write(outFile) + .schema(schema) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + for (GenericRecord record : records) { + appender.add(record); + } + } + parquetFile.deleteOnExit(); + return parquetFile; + } + @TestTemplate public void testAllNulls() { boolean shouldRead; @@ -1084,6 +1095,67 @@ public void testAllNullsVariantNotNull() throws IOException { parquetFile.deleteOnExit(); } + @TestTemplate + public void testVariantFieldMixedValuesNotNull() throws IOException { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + List records = Lists.newArrayList(); + for (int i = 0; i < 10; i++) { + GenericRecord record = GenericRecord.create(VARIANT_SCHEMA); + record.setField("id", i); + if (i % 2 == 0) { + VariantMetadata metadata = Variants.metadata("field"); + ShreddedObject obj = Variants.object(metadata); + obj.put("field", Variants.of("value" + i)); + Variant variant = Variant.of(metadata, obj); + record.setField("variant_field", variant); + } + records.add(record); + } + + File parquetFile = writeParquetFile("test-variant", VARIANT_SCHEMA, records); + InputFile inFile = Files.localInput(parquetFile); + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + ParquetMetricsRowGroupFilter rowGroupFilter = + new ParquetMetricsRowGroupFilter(VARIANT_SCHEMA, notNull("variant_field"), true); + boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan") + .isTrue(); + } + } + + @TestTemplate + public void testVariantFieldAllNullsNotNull() throws IOException { + assumeThat(format).isEqualTo(FileFormat.PARQUET); + + List records = Lists.newArrayListWithExpectedSize(10); + for (int i = 0; i < 10; i++) { + GenericRecord record = GenericRecord.create(VARIANT_SCHEMA); + record.setField("id", i); + record.setField("variant_field", null); + records.add(record); + } + + File parquetFile = writeParquetFile("test-variant-nulls", VARIANT_SCHEMA, records); + InputFile inFile = Files.localInput(parquetFile); + + try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { + BlockMetaData blockMetaData = reader.getRowGroups().get(0); + MessageType fileSchema = reader.getFileMetaData().getSchema(); + + ParquetMetricsRowGroupFilter rowGroupFilter = + new 
ParquetMetricsRowGroupFilter(VARIANT_SCHEMA, notNull("variant_field"), true); + boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); + + assertThat(shouldRead) + .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") + .isTrue(); + } + } + private boolean shouldRead(Expression expression) { return shouldRead(expression, true); } From 3f06a3d56e0e480c76e899555607c2a542468141 Mon Sep 17 00:00:00 2001 From: geruh Date: Thu, 18 Sep 2025 16:51:42 -0700 Subject: [PATCH 6/9] remove old tests --- .../data/TestMetricsRowGroupFilter.java | 91 ------------------- 1 file changed, 91 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java index 54ab97dd4f9e..3c867a9a72d1 100644 --- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java +++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java @@ -1004,97 +1004,6 @@ public void testTransformFilter() { .isTrue(); } - @TestTemplate - public void testVariantFilterNotNull() throws IOException { - assumeThat(format).isEqualTo(FileFormat.PARQUET); - - Schema variantSchema = - new Schema( - required(1, "id", IntegerType.get()), - optional(2, "variant_field", Types.VariantType.get())); - - File parquetFile = new File(tempDir, "test-variant" + System.nanoTime()); - - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = - Parquet.write(outFile) - .schema(variantSchema) - .createWriterFunc(GenericParquetWriter::create) - .build()) { - - for (int i = 0; i < 10; i++) { - GenericRecord record = GenericRecord.create(variantSchema); - record.setField("id", i); - - if (i % 2 == 0) { - VariantMetadata metadata = Variants.metadata("field"); - ShreddedObject obj = Variants.object(metadata); - obj.put("field", Variants.of("value" + i)); - Variant variant = Variant.of(metadata, obj); - record.setField("variant_field", variant); - } - - appender.add(record); - } - } - - InputFile inFile = Files.localInput(parquetFile); - try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { - assertThat(reader.getRowGroups()).as("Should create only one row group").hasSize(1); - BlockMetaData blockMetaData = reader.getRowGroups().get(0); - MessageType fileSchema = reader.getFileMetaData().getSchema(); - - ParquetMetricsRowGroupFilter rowGroupFilter = - new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); - boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan") - .isTrue(); - } - parquetFile.deleteOnExit(); - } - - @TestTemplate - public void testAllNullsVariantNotNull() throws IOException { - assumeThat(format).isEqualTo(FileFormat.PARQUET); - - Schema variantSchema = - new Schema( - required(1, "id", IntegerType.get()), - optional(2, "variant_field", Types.VariantType.get())); - - File parquetFile = new File(tempDir, "test-variant-nulls" + System.nanoTime()); - - OutputFile outFile = Files.localOutput(parquetFile); - try (FileAppender appender = - Parquet.write(outFile) - .schema(variantSchema) - .createWriterFunc(GenericParquetWriter::create) - .build()) { - - for (int i = 0; i < 10; i++) { - GenericRecord record = GenericRecord.create(variantSchema); - record.setField("id", i); - record.setField("variant_field", null); - appender.add(record); - } - } - - InputFile 
inFile = Files.localInput(parquetFile); - try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) { - BlockMetaData blockMetaData = reader.getRowGroups().get(0); - MessageType fileSchema = reader.getFileMetaData().getSchema(); - - ParquetMetricsRowGroupFilter rowGroupFilter = - new ParquetMetricsRowGroupFilter(variantSchema, notNull("variant_field"), true); - boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData); - assertThat(shouldRead) - .as("Should read: variant notNull filters must be evaluated post scan even for all nulls") - .isTrue(); - } - parquetFile.deleteOnExit(); - } - @TestTemplate public void testVariantFieldMixedValuesNotNull() throws IOException { assumeThat(format).isEqualTo(FileFormat.PARQUET); From 3ba38bb590b89ae8c0117298f1b4446aad801159 Mon Sep 17 00:00:00 2001 From: geruh Date: Thu, 18 Sep 2025 18:26:33 -0700 Subject: [PATCH 7/9] clean up and add another test --- .../apache/iceberg/spark/SparkTestHelperBase.java | 4 ++-- .../iceberg/spark/sql/TestFilterPushDown.java | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java index 70286c837cdd..cfb4980df6e4 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java @@ -81,8 +81,8 @@ protected void assertEquals(String context, Object[] expectedRow, Object[] actua assertEquals(newContext, (Object[]) expectedValue, (Object[]) actualValue); } } else if (expectedValue instanceof VariantVal && actualValue instanceof VariantVal) { - // Spark VariantVal may have fixed-width padding, so we compare their JSON representation - // instead of raw byte arrays. + // Spark VariantVal comparison is based on raw byte[] comparison, which can fail + // if Spark uses trailing null bytes, so we compare their JSON representation instead. assertThat((actualValue).toString()) .as("%s contents should match (VariantVal JSON)", context) .isEqualTo((expectedValue).toString()); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java index ee45958ab47f..0ebfde782e4f 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java @@ -604,6 +604,21 @@ public void testVariantExtractFiltering() { withDefaultTimeZone( "UTC", () -> { + checkFilters( + "try_variant_get(data, '$.num', 'int') IS NOT NULL", + "isnotnull(data) AND isnotnull(try_variant_get(data, $.num, IntegerType, false, Some(UTC)))", + "data IS NOT NULL", + ImmutableList.of( + row(1L, toSparkVariantRow("foo", 25)), + row(2L, toSparkVariantRow("bar", 30)), + row(3L, toSparkVariantRow("baz", 35)))); + + checkFilters( + "try_variant_get(data, '$.num', 'int') IS NULL", + "isnull(try_variant_get(data, $.num, IntegerType, false, Some(UTC)))", + "", + ImmutableList.of(row(4L, null))); + checkFilters( "try_variant_get(data, '$.num', 'int') > 30", "isnotnull(data) AND (try_variant_get(data, $.num, IntegerType, false, Some(UTC)) > 30)", From a618461215ccac685585dd294d0a15e0a16b8cc6 Mon Sep 17 00:00:00 2001 From: geruh Date: Thu, 18 Sep 2025 18:33:57 -0700 Subject: [PATCH 8/9] checkstyle.. 
---
 .../iceberg/spark/sql/TestFilterPushDown.java | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java
index 0ebfde782e4f..a984c4c826d2 100644
--- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java
+++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestFilterPushDown.java
@@ -605,19 +605,19 @@ public void testVariantExtractFiltering() {
         "UTC",
         () -> {
           checkFilters(
-            "try_variant_get(data, '$.num', 'int') IS NOT NULL",
-            "isnotnull(data) AND isnotnull(try_variant_get(data, $.num, IntegerType, false, Some(UTC)))",
-            "data IS NOT NULL",
-            ImmutableList.of(
-                row(1L, toSparkVariantRow("foo", 25)),
-                row(2L, toSparkVariantRow("bar", 30)),
-                row(3L, toSparkVariantRow("baz", 35))));
+              "try_variant_get(data, '$.num', 'int') IS NOT NULL",
+              "isnotnull(data) AND isnotnull(try_variant_get(data, $.num, IntegerType, false, Some(UTC)))",
+              "data IS NOT NULL",
+              ImmutableList.of(
+                  row(1L, toSparkVariantRow("foo", 25)),
+                  row(2L, toSparkVariantRow("bar", 30)),
+                  row(3L, toSparkVariantRow("baz", 35))));
 
           checkFilters(
-            "try_variant_get(data, '$.num', 'int') IS NULL",
-            "isnull(try_variant_get(data, $.num, IntegerType, false, Some(UTC)))",
-            "",
-            ImmutableList.of(row(4L, null)));
+              "try_variant_get(data, '$.num', 'int') IS NULL",
+              "isnull(try_variant_get(data, $.num, IntegerType, false, Some(UTC)))",
+              "",
+              ImmutableList.of(row(4L, null)));
 
           checkFilters(
               "try_variant_get(data, '$.num', 'int') > 30",
               "isnotnull(data) AND (try_variant_get(data, $.num, IntegerType, false, Some(UTC)) > 30)",

From a48ec72848b79bea1b6e299675a33ed5ace34e5e Mon Sep 17 00:00:00 2001
From: geruh
Date: Fri, 19 Sep 2025 18:02:58 -0700
Subject: [PATCH 9/9] Address review comments

---
 .../apache/iceberg/data/TestMetricsRowGroupFilter.java | 8 +++-----
 .../org/apache/iceberg/spark/SparkTestHelperBase.java  | 3 ++-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
index 3c867a9a72d1..e12015d5eb73 100644
--- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
+++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
@@ -1029,8 +1029,8 @@ public void testVariantFieldMixedValuesNotNull() throws IOException {
       MessageType fileSchema = reader.getFileMetaData().getSchema();
       ParquetMetricsRowGroupFilter rowGroupFilter =
           new ParquetMetricsRowGroupFilter(VARIANT_SCHEMA, notNull("variant_field"), true);
-      boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData);
-      assertThat(shouldRead)
+
+      assertThat(rowGroupFilter.shouldRead(fileSchema, blockMetaData))
           .as("Should read: variant notNull filters must be evaluated post scan")
           .isTrue();
     }
@@ -1054,12 +1054,10 @@ public void testVariantFieldAllNullsNotNull() throws IOException {
     try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
       BlockMetaData blockMetaData = reader.getRowGroups().get(0);
       MessageType fileSchema = reader.getFileMetaData().getSchema();
-
       ParquetMetricsRowGroupFilter rowGroupFilter =
           new ParquetMetricsRowGroupFilter(VARIANT_SCHEMA, notNull("variant_field"), true);
-      boolean shouldRead = rowGroupFilter.shouldRead(fileSchema, blockMetaData);
-      assertThat(shouldRead)
+      assertThat(rowGroupFilter.shouldRead(fileSchema, blockMetaData))
           .as("Should read: variant notNull filters must be evaluated post scan even for all nulls")
           .isTrue();
     }
diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java
index cfb4980df6e4..2754e891a481 100644
--- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java
+++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/SparkTestHelperBase.java
@@ -83,7 +83,8 @@ protected void assertEquals(String context, Object[] expectedRow, Object[] actua
     } else if (expectedValue instanceof VariantVal && actualValue instanceof VariantVal) {
       // Spark VariantVal equality is based on raw byte[] comparison, which can fail
       // if Spark pads values with trailing null bytes, so we compare their JSON representation instead.
-      assertThat((actualValue).toString())
+      assertThat(actualValue)
+          .asString()
           .as("%s contents should match (VariantVal JSON)", context)
           .isEqualTo((expectedValue).toString());
     } else if (expectedValue != ANY) {
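
A note on the asString() idiom adopted in PATCH 9/9: AssertJ's asString() converts the object under test with toString() and then applies string assertions, which is what lets the VariantVal check above pass when two encodings differ only in byte-level padding. The sketch below is a minimal, self-contained illustration of that idiom, not part of the series; StringBuilder is a hypothetical stand-in for VariantVal, chosen only because a direct isEqualTo() between two StringBuilder instances also fails despite identical contents.

import static org.assertj.core.api.Assertions.assertThat;

public class AsStringIdiomSketch {
  public static void main(String[] args) {
    // Two distinct objects with identical rendered output. A plain
    // assertThat(actual).isEqualTo(expected) would fail here, much as a raw
    // byte[] comparison can fail for VariantVal when padding differs.
    StringBuilder actual = new StringBuilder("{\"num\":25}");
    StringBuilder expected = new StringBuilder("{\"num\":25}");

    assertThat(actual)
        .asString() // asserts on actual.toString(), not on object identity
        .isEqualTo(expected.toString()); // compares the rendered strings
  }
}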