diff --git a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
index e12015d5eb73..c871c25c931d 100644
--- a/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
+++ b/data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java
@@ -119,7 +119,8 @@ public static List parameters() {
           optional(14, "all_nans", DoubleType.get()),
           optional(15, "some_nans", FloatType.get()),
           optional(16, "no_nans", DoubleType.get()),
-          optional(17, "some_double_nans", DoubleType.get()));
+          optional(17, "some_double_nans", DoubleType.get()),
+          optional(18, "uuid_col", Types.UUIDType.get()));
 
   private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
       Types.StructType.of(Types.NestedField.required(8, "_int_field", IntegerType.get()));
@@ -137,7 +138,8 @@ public static List parameters() {
           optional(14, "_all_nans", Types.DoubleType.get()),
           optional(15, "_some_nans", FloatType.get()),
           optional(16, "_no_nans", Types.DoubleType.get()),
-          optional(17, "_some_double_nans", Types.DoubleType.get()));
+          optional(17, "_some_double_nans", Types.DoubleType.get()),
+          optional(18, "_uuid_col", Types.UUIDType.get()));
 
   private static final Schema VARIANT_SCHEMA =
       new Schema(
@@ -157,6 +159,9 @@ public static List parameters() {
   private static final int INT_MIN_VALUE = 30;
   private static final int INT_MAX_VALUE = 79;
 
+  private static final UUID UUID_WITH_ZEROS =
+      UUID.fromString("00000000-0000-0000-0000-000000000000");
+
   private File orcFile = null;
   private MessageType parquetSchema = null;
   private BlockMetaData rowGroupMetadata = null;
@@ -210,6 +215,7 @@ public void createOrcInputFile() throws IOException {
       GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
       structNotNull.setField("_int_field", INT_MIN_VALUE + i);
       record.setField("_struct_not_null", structNotNull); // struct with int
+      record.setField("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);
 
       appender.add(record);
     }
@@ -248,6 +254,8 @@ private void createParquetInputFile() throws IOException {
       GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
       structNotNull.setField("_int_field", INT_MIN_VALUE + i);
       builder.setField("_struct_not_null", structNotNull); // struct with int
+      builder.setField("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);
+
       records.add(builder);
     }
 
@@ -1063,6 +1071,59 @@ public void testVariantFieldAllNullsNotNull() throws IOException {
     }
   }
 
+  @TestTemplate
+  public void testUUID() {
+    assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET);
+
+    UUID nonExistentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");
+
+    boolean shouldRead = shouldRead(equal("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead = shouldRead(equal("uuid_col", nonExistentUuid));
+    assertThat(shouldRead).as("Should skip: column does not contain the value").isFalse();
+
+    shouldRead = shouldRead(notEqual("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains nulls").isTrue();
+
+    shouldRead = shouldRead(notEqual("uuid_col", nonExistentUuid));
+    assertThat(shouldRead).as("Should read: column contains non-matching values").isTrue();
+
+    shouldRead = shouldRead(lessThan("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should skip: no values lower").isFalse();
+
+    shouldRead = shouldRead(lessThanOrEqual("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead = shouldRead(greaterThan("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should skip: no values greater").isFalse();
+
+    shouldRead = shouldRead(greaterThanOrEqual("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead = shouldRead(isNull("uuid_col"));
+    assertThat(shouldRead).as("Should read: column contains null values").isTrue();
+
+    shouldRead = shouldRead(notNull("uuid_col"));
+    assertThat(shouldRead).as("Should read: column contains non-null values").isTrue();
+
+    shouldRead = shouldRead(in("uuid_col", UUID_WITH_ZEROS, nonExistentUuid));
+    assertThat(shouldRead).as("Should read: column contains one of the values").isTrue();
+
+    shouldRead = shouldRead(in("uuid_col", nonExistentUuid));
+    assertThat(shouldRead).as("Should skip: column contains none of the values").isFalse();
+
+    shouldRead = shouldRead(notIn("uuid_col", nonExistentUuid));
+    assertThat(shouldRead)
+        .as("Should read: column contains values not in the exclusion list")
+        .isTrue();
+
+    shouldRead = shouldRead(notIn("uuid_col", UUID_WITH_ZEROS));
+    assertThat(shouldRead)
+        .as("Should read: column contains null values not in the exclusion list")
+        .isTrue();
+  }
+
   private boolean shouldRead(Expression expression) {
     return shouldRead(expression, true);
   }
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java
index 3a70198a1a57..1e5ed1fb9b87 100644
--- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java
+++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java
@@ -83,6 +83,8 @@ static Function converterFromParquet(
     } else if (icebergType.typeId() == Type.TypeID.DOUBLE
         && parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FLOAT) {
       return value -> ((Float) fromParquet.apply(value)).doubleValue();
+    } else if (icebergType.typeId() == Type.TypeID.UUID) {
+      return binary -> UUIDUtil.convert(((Binary) binary).toByteBuffer());
     }
   }
 
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
index ac6e41347d47..48cc2b0992a5 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
@@ -111,7 +111,8 @@ public class TestDictionaryRowGroupFilter {
               14,
               "decimal_fixed",
               DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
-          optional(15, "_nans_and_nulls", DoubleType.get()));
+          optional(15, "_nans_and_nulls", DoubleType.get()),
+          optional(16, "uuid_col", Types.UUIDType.get()));
 
   private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
       Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get()));
@@ -133,7 +134,8 @@ public class TestDictionaryRowGroupFilter {
               14,
               "_decimal_fixed",
               DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
-          optional(15, "_nans_and_nulls", DoubleType.get()));
+          optional(15, "_nans_and_nulls", DoubleType.get()),
+          optional(16, "_uuid_col", Types.UUIDType.get()));
 
   private static final String TOO_LONG_FOR_STATS;
 
@@ -153,6 +155,9 @@ public class TestDictionaryRowGroupFilter {
           .subtract(DECIMAL_MIN_VALUE)
          .divide(new BigDecimal(INT_MAX_VALUE - INT_MIN_VALUE), RoundingMode.HALF_UP);
 
+  private static final UUID UUID_WITH_ZEROS =
+      UUID.fromString("00000000-0000-0000-0000-000000000000");
+
   private MessageType parquetSchema = null;
   private BlockMetaData rowGroupMetadata = null;
   private DictionaryPageReadStore dictionaryStore = null;
@@ -203,6 +208,8 @@ public void createInputFile() throws IOException {
         structNotNull.put("_int_field", INT_MIN_VALUE + i);
         builder.set("_struct_not_null", structNotNull); // struct with int
 
+        builder.set("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);
+
         appender.add(builder.build());
       }
     }
@@ -1267,6 +1274,89 @@ public void testTransformFilter() {
         .isTrue();
   }
 
+  @TestTemplate
+  public void testUUID() {
+    assumeThat(getColumnForName(rowGroupMetadata, "_uuid_col").getEncodings())
+        .contains(Encoding.RLE_DICTIONARY);
+
+    UUID nonExistentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");
+
+    boolean shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains the value").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: column does not contain the value").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains nulls").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains non-matching values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: no uuid less than lower bound").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: no uuid greater than upper bound").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThanOrEqual("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: one possible uuid").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("uuid_col"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains null values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("uuid_col"))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains non-null values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(
+                SCHEMA, in("uuid_col", UUID_WITH_ZEROS, nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should read: column contains one of the values").isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, in("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead).as("Should skip: column contains none of the values").isFalse();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("uuid_col", nonExistentUuid))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead)
+        .as("Should read: column contains values not in the exclusion list")
+        .isTrue();
+
+    shouldRead =
+        new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("uuid_col", UUID_WITH_ZEROS))
+            .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
+    assertThat(shouldRead)
+        .as("Should read: column contains null values not in the exclusion list")
+        .isTrue();
+  }
+
   private ColumnChunkMetaData getColumnForName(BlockMetaData rowGroup, String columnName) {
     ColumnPath columnPath = ColumnPath.fromDotString(columnName);
     for (ColumnChunkMetaData column : rowGroup.getColumns()) {