Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ public static List<Object> parameters() {
optional(14, "all_nans", DoubleType.get()),
optional(15, "some_nans", FloatType.get()),
optional(16, "no_nans", DoubleType.get()),
optional(17, "some_double_nans", DoubleType.get()));
optional(17, "some_double_nans", DoubleType.get()),
optional(18, "uuid_col", Types.UUIDType.get()));

private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
Types.StructType.of(Types.NestedField.required(8, "_int_field", IntegerType.get()));
Expand All @@ -137,7 +138,8 @@ public static List<Object> parameters() {
optional(14, "_all_nans", Types.DoubleType.get()),
optional(15, "_some_nans", FloatType.get()),
optional(16, "_no_nans", Types.DoubleType.get()),
optional(17, "_some_double_nans", Types.DoubleType.get()));
optional(17, "_some_double_nans", Types.DoubleType.get()),
optional(18, "_uuid_col", Types.UUIDType.get()));

private static final Schema VARIANT_SCHEMA =
new Schema(
Expand All @@ -157,6 +159,9 @@ public static List<Object> parameters() {
private static final int INT_MIN_VALUE = 30;
private static final int INT_MAX_VALUE = 79;

private static final UUID UUID_WITH_ZEROS =
UUID.fromString("00000000-0000-0000-0000-000000000000");

private File orcFile = null;
private MessageType parquetSchema = null;
private BlockMetaData rowGroupMetadata = null;
Expand Down Expand Up @@ -210,6 +215,7 @@ public void createOrcInputFile() throws IOException {
GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
structNotNull.setField("_int_field", INT_MIN_VALUE + i);
record.setField("_struct_not_null", structNotNull); // struct with int
record.setField("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);

appender.add(record);
}
Expand Down Expand Up @@ -248,6 +254,8 @@ private void createParquetInputFile() throws IOException {
GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
structNotNull.setField("_int_field", INT_MIN_VALUE + i);
builder.setField("_struct_not_null", structNotNull); // struct with int
builder.setField("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);

records.add(builder);
}

Expand Down Expand Up @@ -1063,6 +1071,59 @@ public void testVariantFieldAllNullsNotNull() throws IOException {
}
}

@TestTemplate
public void testUUID() {
  // UUID lower/upper bounds are only written by the Parquet metrics path.
  assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET);

  // A UUID that was never written; the file only contains the all-zeros UUID and nulls.
  UUID absentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");

  // Equality against present and absent values.
  assertThat(shouldRead(equal("uuid_col", UUID_WITH_ZEROS)))
      .as("Should read: column contains the value")
      .isTrue();
  assertThat(shouldRead(equal("uuid_col", absentUuid)))
      .as("Should skip: column does not contain the value")
      .isFalse();

  // notEqual can never prune here: nulls and non-matching values both force a read.
  assertThat(shouldRead(notEqual("uuid_col", UUID_WITH_ZEROS)))
      .as("Should read: column contains nulls")
      .isTrue();
  assertThat(shouldRead(notEqual("uuid_col", absentUuid)))
      .as("Should read: column contains non-matching values")
      .isTrue();

  // Range predicates against the single stored value (the all-zeros UUID is both min and max).
  assertThat(shouldRead(lessThan("uuid_col", UUID_WITH_ZEROS)))
      .as("Should skip: no values lower")
      .isFalse();
  assertThat(shouldRead(lessThanOrEqual("uuid_col", UUID_WITH_ZEROS)))
      .as("Should read: column contains the value")
      .isTrue();
  assertThat(shouldRead(greaterThan("uuid_col", UUID_WITH_ZEROS)))
      .as("Should skip: no values greater")
      .isFalse();
  assertThat(shouldRead(greaterThanOrEqual("uuid_col", UUID_WITH_ZEROS)))
      .as("Should read: column contains the value")
      .isTrue();

  // Null checks: the column holds a mix of nulls and non-nulls, so both must read.
  assertThat(shouldRead(isNull("uuid_col")))
      .as("Should read: column contains null values")
      .isTrue();
  assertThat(shouldRead(notNull("uuid_col")))
      .as("Should read: column contains non-null values")
      .isTrue();

  // Set membership.
  assertThat(shouldRead(in("uuid_col", UUID_WITH_ZEROS, absentUuid)))
      .as("Should read: column contains one of the values")
      .isTrue();
  assertThat(shouldRead(in("uuid_col", absentUuid)))
      .as("Should skip: column contains none of the values")
      .isFalse();
  assertThat(shouldRead(notIn("uuid_col", absentUuid)))
      .as("Should read: column contains values not in the exclusion list")
      .isTrue();
  assertThat(shouldRead(notIn("uuid_col", UUID_WITH_ZEROS)))
      .as("Should read: column contains null values not in the exclusion list")
      .isTrue();
}

private boolean shouldRead(Expression expression) {
return shouldRead(expression, true);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ static Function<Object, Object> converterFromParquet(
} else if (icebergType.typeId() == Type.TypeID.DOUBLE
&& parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FLOAT) {
return value -> ((Float) fromParquet.apply(value)).doubleValue();
} else if (icebergType.typeId() == Type.TypeID.UUID) {
return binary -> UUIDUtil.convert(((Binary) binary).toByteBuffer());
Comment on lines +86 to +87
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like an odd place to apply this conversion since the rows above are more about schema evolution. However, looking at it a bit closer, I think it makes sense. Other logical types, such as TimestampLiteral store the primitive type internally (long), while the UUIDLiteral keeps a UUID rather than bytes.

This will just compare the bytes using an unsigned lexicographical binary comparator.

}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ public class TestDictionaryRowGroupFilter {
14,
"decimal_fixed",
DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
optional(15, "_nans_and_nulls", DoubleType.get()));
optional(15, "_nans_and_nulls", DoubleType.get()),
optional(16, "uuid_col", Types.UUIDType.get()));

private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get()));
Expand All @@ -133,7 +134,8 @@ public class TestDictionaryRowGroupFilter {
14,
"_decimal_fixed",
DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
optional(15, "_nans_and_nulls", DoubleType.get()));
optional(15, "_nans_and_nulls", DoubleType.get()),
optional(16, "_uuid_col", Types.UUIDType.get()));

private static final String TOO_LONG_FOR_STATS;

Expand All @@ -153,6 +155,9 @@ public class TestDictionaryRowGroupFilter {
.subtract(DECIMAL_MIN_VALUE)
.divide(new BigDecimal(INT_MAX_VALUE - INT_MIN_VALUE), RoundingMode.HALF_UP);

private static final UUID UUID_WITH_ZEROS =
UUID.fromString("00000000-0000-0000-0000-000000000000");

private MessageType parquetSchema = null;
private BlockMetaData rowGroupMetadata = null;
private DictionaryPageReadStore dictionaryStore = null;
Expand Down Expand Up @@ -203,6 +208,8 @@ public void createInputFile() throws IOException {
structNotNull.put("_int_field", INT_MIN_VALUE + i);
builder.set("_struct_not_null", structNotNull); // struct with int

builder.set("_uuid_col", (i % 2 == 0) ? UUID_WITH_ZEROS : null);

appender.add(builder.build());
}
}
Expand Down Expand Up @@ -1267,6 +1274,89 @@ public void testTransformFilter() {
.isTrue();
}

@TestTemplate
public void testUUID() {
  // The dictionary filter only applies when the column was dictionary-encoded.
  assumeThat(getColumnForName(rowGroupMetadata, "_uuid_col").getEncodings())
      .contains(Encoding.RLE_DICTIONARY);

  // A UUID absent from the file; the dictionary only holds the all-zeros UUID (plus nulls).
  UUID absentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");

  // Equality against present and absent dictionary entries.
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", UUID_WITH_ZEROS))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains the value")
      .isTrue();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", absentUuid))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should skip: column does not contain the value")
      .isFalse();

  // notEqual never prunes here: nulls and non-matching entries both force a read.
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("uuid_col", UUID_WITH_ZEROS))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains nulls")
      .isTrue();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, notEqual("uuid_col", absentUuid))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains non-matching values")
      .isTrue();

  // Range predicates: the all-zeros UUID is the only dictionary entry, so it is min and max.
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, lessThan("uuid_col", UUID_WITH_ZEROS))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should skip: no uuid less than lower bound")
      .isFalse();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, lessThanOrEqual("uuid_col", UUID_WITH_ZEROS))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: one possible uuid")
      .isTrue();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, greaterThan("uuid_col", UUID_WITH_ZEROS))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should skip: no uuid greater than upper bound")
      .isFalse();
  assertThat(
          new ParquetDictionaryRowGroupFilter(
                  SCHEMA, greaterThanOrEqual("uuid_col", UUID_WITH_ZEROS))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: one possible uuid")
      .isTrue();

  // Null checks: the column holds both nulls and non-nulls, so both must read.
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, isNull("uuid_col"))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains null values")
      .isTrue();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, notNull("uuid_col"))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains non-null values")
      .isTrue();

  // Set membership.
  assertThat(
          new ParquetDictionaryRowGroupFilter(
                  SCHEMA, in("uuid_col", UUID_WITH_ZEROS, absentUuid))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains one of the values")
      .isTrue();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, in("uuid_col", absentUuid))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should skip: column contains none of the values")
      .isFalse();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("uuid_col", absentUuid))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains values not in the exclusion list")
      .isTrue();
  assertThat(
          new ParquetDictionaryRowGroupFilter(SCHEMA, notIn("uuid_col", UUID_WITH_ZEROS))
              .shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore))
      .as("Should read: column contains null values not in the exclusion list")
      .isTrue();
}

private ColumnChunkMetaData getColumnForName(BlockMetaData rowGroup, String columnName) {
ColumnPath columnPath = ColumnPath.fromDotString(columnName);
for (ColumnChunkMetaData column : rowGroup.getColumns()) {
Expand Down