-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Parquet variant array write #12847
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Parquet variant array write #12847
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,7 +25,11 @@ | |
| import java.util.Comparator; | ||
| import java.util.Deque; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
| import java.util.Optional; | ||
| import java.util.function.Function; | ||
| import java.util.stream.Collectors; | ||
| import org.apache.iceberg.expressions.PathUtil; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
|
|
@@ -395,7 +399,30 @@ public Type object(VariantObject object, List<String> names, List<Type> typedVal | |
|
|
||
| @Override | ||
| public Type array(VariantArray array, List<Type> elementResults) { | ||
| return null; | ||
| if (elementResults.isEmpty()) { | ||
| return null; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this not support shredding an array of encoded variants? This could be a list with only an inner |
||
| } | ||
|
|
||
| // Choose most common type as shredding type and build 3-level list | ||
| Type defaultTYpe = elementResults.get(0); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typo: |
||
| Type shredType = | ||
| elementResults.stream() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is okay, but it seems strange to me to rely on |
||
| .filter(Objects::nonNull) | ||
| .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())) | ||
| .entrySet() | ||
| .stream() | ||
| .max(Map.Entry.comparingByValue()) | ||
| .map(Map.Entry::getKey) | ||
| .orElse(defaultTYpe); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the list is not empty, then this should never be used. Why default it? |
||
|
|
||
| return list(shredType); | ||
| } | ||
|
|
||
| private static GroupType list(Type shreddedType) { | ||
| GroupType elementType = field("element", shreddedType); | ||
| checkField(elementType); | ||
|
|
||
| return Types.optionalList().element(elementType).named("typed_value"); | ||
| } | ||
|
|
||
| @Override | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ | |
| import org.apache.iceberg.variants.PhysicalType; | ||
| import org.apache.iceberg.variants.ShreddedObject; | ||
| import org.apache.iceberg.variants.Variant; | ||
| import org.apache.iceberg.variants.VariantArray; | ||
| import org.apache.iceberg.variants.VariantMetadata; | ||
| import org.apache.iceberg.variants.VariantObject; | ||
| import org.apache.iceberg.variants.VariantValue; | ||
|
|
@@ -98,6 +99,17 @@ static ParquetValueWriter<VariantValue> objects( | |
| builder.build()); | ||
| } | ||
|
|
||
| @SuppressWarnings("unchecked") | ||
| public static ParquetValueWriter<VariantValue> array( | ||
rdblue marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| int repeatedDefinitionLevel, | ||
| int repeatedRepetitionLevel, | ||
| ParquetValueWriter<?> elementWriter) { | ||
| return new ArrayWriter( | ||
| repeatedDefinitionLevel, | ||
| repeatedRepetitionLevel, | ||
| (ParquetValueWriter<VariantValue>) elementWriter); | ||
| } | ||
|
|
||
| private static class VariantWriter implements ParquetValueWriter<Variant> { | ||
| private final ParquetValueWriter<VariantMetadata> metadataWriter; | ||
| private final ParquetValueWriter<VariantValue> valueWriter; | ||
|
|
@@ -360,6 +372,55 @@ public void setColumnStore(ColumnWriteStore columnStore) { | |
| } | ||
| } | ||
|
|
||
| private static class ArrayWriter implements TypedWriter { | ||
| private final int definitionLevel; | ||
| private final int repetitionLevel; | ||
| private final ParquetValueWriter<VariantValue> writer; | ||
| private final List<TripleWriter<?>> children; | ||
|
|
||
| protected ArrayWriter( | ||
| int definitionLevel, int repetitionLevel, ParquetValueWriter<VariantValue> writer) { | ||
| this.definitionLevel = definitionLevel; | ||
| this.repetitionLevel = repetitionLevel; | ||
| this.writer = writer; | ||
| this.children = writer.columns(); | ||
| } | ||
|
|
||
| @Override | ||
| public Set<PhysicalType> types() { | ||
| return Set.of(PhysicalType.ARRAY); | ||
| } | ||
|
|
||
| @Override | ||
| public void write(int parentRepetition, VariantValue value) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks correct to me. |
||
| VariantArray arr = value.asArray(); | ||
| if (arr.numElements() == 0) { | ||
| writeNull(writer, parentRepetition, definitionLevel); | ||
| } else { | ||
| for (int i = 0; i < arr.numElements(); i++) { | ||
| VariantValue element = arr.get(i); | ||
|
|
||
| int rl = repetitionLevel; | ||
| if (i == 0) { | ||
| rl = parentRepetition; | ||
| } | ||
|
|
||
| writer.write(rl, element); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public List<TripleWriter<?>> columns() { | ||
| return children; | ||
| } | ||
|
|
||
| @Override | ||
| public void setColumnStore(ColumnWriteStore columnStore) { | ||
| writer.setColumnStore(columnStore); | ||
| } | ||
| } | ||
|
|
||
| private static void writeNull( | ||
| ParquetValueWriter<?> writer, int repetitionLevel, int definitionLevel) { | ||
| for (TripleWriter<?> column : writer.columns()) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,10 +41,13 @@ | |
| import org.apache.iceberg.relocated.com.google.common.collect.Iterables; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.apache.iceberg.types.Types; | ||
| import org.apache.iceberg.variants.ValueArray; | ||
| import org.apache.iceberg.variants.Variant; | ||
| import org.apache.iceberg.variants.VariantArray; | ||
| import org.apache.iceberg.variants.VariantMetadata; | ||
| import org.apache.iceberg.variants.VariantObject; | ||
| import org.apache.iceberg.variants.VariantTestUtil; | ||
| import org.apache.iceberg.variants.VariantValue; | ||
| import org.apache.iceberg.variants.Variants; | ||
| import org.junit.jupiter.params.ParameterizedTest; | ||
| import org.junit.jupiter.params.provider.FieldSource; | ||
|
|
@@ -73,6 +76,12 @@ public class TestVariantWriters { | |
| "c", Variants.of("string"))); | ||
| private static final ByteBuffer EMPTY_OBJECT_BUFFER = | ||
| VariantTestUtil.createObject(TEST_METADATA_BUFFER, ImmutableMap.of()); | ||
| private static final ByteBuffer ARRAY_IN_OBJECT_BUFFER = | ||
| VariantTestUtil.createObject( | ||
| TEST_METADATA_BUFFER, | ||
| ImmutableMap.of( | ||
| "a", Variants.of(123456789), | ||
| "c", array(Variants.of("string"), Variants.of("iceberg")))); | ||
|
|
||
| private static final VariantMetadata EMPTY_METADATA = | ||
| Variants.metadata(VariantTestUtil.emptyMetadata()); | ||
|
|
@@ -83,6 +92,45 @@ public class TestVariantWriters { | |
| (VariantObject) Variants.value(TEST_METADATA, SIMILAR_OBJECT_BUFFER); | ||
| private static final VariantObject EMPTY_OBJECT = | ||
| (VariantObject) Variants.value(TEST_METADATA, EMPTY_OBJECT_BUFFER); | ||
| private static final VariantObject ARRAY_IN_OBJECT = | ||
| (VariantObject) Variants.value(TEST_METADATA, ARRAY_IN_OBJECT_BUFFER); | ||
|
|
||
| private static final ByteBuffer EMPTY_ARRAY_BUFFER = VariantTestUtil.createArray(); | ||
| private static final ByteBuffer TEST_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray(Variants.of("iceberg"), Variants.of("string")); | ||
| private static final ByteBuffer MIXED_TYPE_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray(Variants.of("iceberg"), Variants.of("string"), Variants.of(34)); | ||
| private static final ByteBuffer NESTED_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray( | ||
| array(Variants.of("string"), Variants.of("iceberg")), | ||
| array(Variants.of("string"), Variants.of("iceberg"))); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use different string values for the second array? |
||
| private static final ByteBuffer MIXED_NESTED_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray( | ||
| array(Variants.of("string"), Variants.of("iceberg"), Variants.of(34)), | ||
| array(Variants.of(34), Variants.ofNull()), | ||
| array(), | ||
| array(Variants.of("string"), Variants.of("iceberg")), | ||
| Variants.of(34)); | ||
| private static final ByteBuffer OBJECT_IN_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray(SIMILAR_OBJECT, SIMILAR_OBJECT); | ||
| private static final ByteBuffer MIXED_OBJECT_IN_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray( | ||
| SIMILAR_OBJECT, SIMILAR_OBJECT, Variants.of("iceberg"), Variants.of(34)); | ||
|
|
||
| private static final VariantArray EMPTY_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, EMPTY_ARRAY_BUFFER); | ||
| private static final VariantArray TEST_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, TEST_ARRAY_BUFFER); | ||
| private static final VariantArray MIXED_TYPE_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, MIXED_TYPE_ARRAY_BUFFER); | ||
| private static final VariantArray NESTED_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, NESTED_ARRAY_BUFFER); | ||
| private static final VariantArray MIXED_NESTED_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, MIXED_NESTED_ARRAY_BUFFER); | ||
| private static final VariantArray OBJECT_IN_ARRAY = | ||
| (VariantArray) Variants.value(TEST_METADATA, OBJECT_IN_ARRAY_BUFFER); | ||
| private static final VariantArray MIXED_OBJECT_IN_ARRAY = | ||
| (VariantArray) Variants.value(TEST_METADATA, MIXED_OBJECT_IN_ARRAY_BUFFER); | ||
|
|
||
| private static final Variant[] VARIANTS = | ||
| new Variant[] { | ||
|
|
@@ -104,6 +152,14 @@ public class TestVariantWriters { | |
| Variant.of(EMPTY_METADATA, EMPTY_OBJECT), | ||
| Variant.of(TEST_METADATA, TEST_OBJECT), | ||
| Variant.of(TEST_METADATA, SIMILAR_OBJECT), | ||
| Variant.of(TEST_METADATA, ARRAY_IN_OBJECT), | ||
| Variant.of(EMPTY_METADATA, EMPTY_ARRAY), | ||
| Variant.of(EMPTY_METADATA, TEST_ARRAY), | ||
| Variant.of(EMPTY_METADATA, MIXED_TYPE_ARRAY), | ||
| Variant.of(EMPTY_METADATA, NESTED_ARRAY), | ||
| Variant.of(EMPTY_METADATA, MIXED_NESTED_ARRAY), | ||
| Variant.of(TEST_METADATA, OBJECT_IN_ARRAY), | ||
| Variant.of(TEST_METADATA, MIXED_OBJECT_IN_ARRAY), | ||
| Variant.of(EMPTY_METADATA, Variants.ofIsoDate("2024-11-07")), | ||
| Variant.of(EMPTY_METADATA, Variants.ofIsoDate("1957-11-07")), | ||
| Variant.of(EMPTY_METADATA, Variants.ofIsoTimestamptz("2024-11-07T12:33:54.123456+00:00")), | ||
|
|
@@ -200,4 +256,13 @@ private static List<Record> writeAndRead( | |
| return Lists.newArrayList(reader); | ||
| } | ||
| } | ||
|
|
||
| private static ValueArray array(VariantValue... values) { | ||
| ValueArray arr = Variants.array(); | ||
| for (VariantValue value : values) { | ||
| arr.add(value); | ||
| } | ||
|
|
||
| return arr; | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.