-
Notifications
You must be signed in to change notification settings - Fork 3k
Parquet variant array write #12847
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Parquet variant array write #12847
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ | |
| import org.apache.iceberg.variants.PhysicalType; | ||
| import org.apache.iceberg.variants.ShreddedObject; | ||
| import org.apache.iceberg.variants.Variant; | ||
| import org.apache.iceberg.variants.VariantArray; | ||
| import org.apache.iceberg.variants.VariantMetadata; | ||
| import org.apache.iceberg.variants.VariantObject; | ||
| import org.apache.iceberg.variants.VariantValue; | ||
|
|
@@ -98,6 +99,26 @@ static ParquetValueWriter<VariantValue> objects( | |
| builder.build()); | ||
| } | ||
|
|
||
| public static ParquetValueWriter<?> array( | ||
| int valueDefinitionLevel, | ||
| ParquetValueWriter<?> valueWriter, | ||
| int typedDefinitionLevel, | ||
| int repeatedDefinitionLevel, | ||
| int repeatedRepetitionLevel, | ||
| ParquetValueWriter<?> elementWriter) { | ||
| ArrayWriter typedWriter = | ||
| new ArrayWriter( | ||
aihuaxu marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| repeatedDefinitionLevel, | ||
| repeatedRepetitionLevel, | ||
| (ParquetValueWriter<VariantValue>) elementWriter); | ||
|
|
||
| return new ArrayValueWriter( | ||
| valueDefinitionLevel, | ||
| (ParquetValueWriter<VariantValue>) valueWriter, | ||
| typedDefinitionLevel, | ||
| typedWriter); | ||
| } | ||
|
|
||
| private static class VariantWriter implements ParquetValueWriter<Variant> { | ||
| private final ParquetValueWriter<VariantMetadata> metadataWriter; | ||
| private final ParquetValueWriter<VariantValue> valueWriter; | ||
|
|
@@ -360,6 +381,92 @@ public void setColumnStore(ColumnWriteStore columnStore) { | |
| } | ||
| } | ||
|
|
||
| private static class ArrayValueWriter implements ParquetValueWriter<VariantValue> { | ||
| private final int valueDefinitionLevel; | ||
| private final ParquetValueWriter<VariantValue> valueWriter; | ||
| private final int typedDefinitionLevel; | ||
| private final ArrayWriter typedWriter; | ||
| private final List<TripleWriter<?>> children; | ||
|
|
||
| private ArrayValueWriter( | ||
aihuaxu marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| int valueDefinitionLevel, | ||
| ParquetValueWriter<VariantValue> valueWriter, | ||
| int typedDefinitionLevel, | ||
| ArrayWriter typedWriter) { | ||
| this.valueDefinitionLevel = valueDefinitionLevel; | ||
| this.valueWriter = valueWriter; | ||
| this.typedDefinitionLevel = typedDefinitionLevel; | ||
| this.typedWriter = typedWriter; | ||
| this.children = children(valueWriter, typedWriter); | ||
| } | ||
|
|
||
| /** Writes an ARRAY value through the typed writer and any other value through the value writer, calling writeNull on the writer that was not used. */ @Override | ||
| public void write(int repetitionLevel, VariantValue value) { | ||
| /* dispatch on the physical type of the incoming value */ if (value.type() == PhysicalType.ARRAY) { | ||
| typedWriter.write(repetitionLevel, value); | ||
| /* the value writer is nulled at valueDefinitionLevel */ writeNull(valueWriter, repetitionLevel, valueDefinitionLevel); | ||
| } else { | ||
| valueWriter.write(repetitionLevel, value); | ||
| /* the typed writer is nulled at typedDefinitionLevel */ writeNull(typedWriter, repetitionLevel, typedDefinitionLevel); | ||
| } | ||
| } | ||
|
|
||
| /** Returns the column list computed once in the constructor by children(valueWriter, typedWriter). */ @Override | ||
| public List<TripleWriter<?>> columns() { | ||
| return children; | ||
| } | ||
|
|
||
| /** Passes the column store to the value writer and then to the typed array writer. */ @Override | ||
| public void setColumnStore(ColumnWriteStore columnStore) { | ||
| valueWriter.setColumnStore(columnStore); | ||
| typedWriter.setColumnStore(columnStore); | ||
| } | ||
| } | ||
|
|
||
| private static class ArrayWriter implements ParquetValueWriter<VariantValue> { | ||
| private final int definitionLevel; | ||
| private final int repetitionLevel; | ||
| private final ParquetValueWriter<VariantValue> writer; | ||
| private final List<TripleWriter<?>> children; | ||
|
|
||
| /** Stores the definition level, the repetition level and the element writer used by write. */ private ArrayWriter( | ||
| int definitionLevel, int repetitionLevel, ParquetValueWriter<VariantValue> writer) { | ||
| this.definitionLevel = definitionLevel; | ||
| this.repetitionLevel = repetitionLevel; | ||
| this.writer = writer; | ||
| /* this writer reports exactly the element writer's columns */ this.children = writer.columns(); | ||
| } | ||
|
|
||
| @Override | ||
| public void write(int parentRepetition, VariantValue value) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This looks correct to me. |
||
| VariantArray arr = value.asArray(); | ||
| if (arr.numElements() == 0) { | ||
| writeNull(writer, parentRepetition, definitionLevel); | ||
| } else { | ||
| for (int i = 0; i < arr.numElements(); i++) { | ||
| VariantValue element = arr.get(i); | ||
|
|
||
| int rl = repetitionLevel; | ||
| if (i == 0) { | ||
| rl = parentRepetition; | ||
| } | ||
|
|
||
| writer.write(rl, element); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** Returns the element writer's columns captured in the constructor. */ @Override | ||
| public List<TripleWriter<?>> columns() { | ||
| return children; | ||
| } | ||
|
|
||
| /** Forwards the column store to the element writer. */ @Override | ||
| public void setColumnStore(ColumnWriteStore columnStore) { | ||
| writer.setColumnStore(columnStore); | ||
| } | ||
| } | ||
|
|
||
| private static void writeNull( | ||
| ParquetValueWriter<?> writer, int repetitionLevel, int definitionLevel) { | ||
| for (TripleWriter<?> column : writer.columns()) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,10 +41,13 @@ | |
| import org.apache.iceberg.relocated.com.google.common.collect.Iterables; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.apache.iceberg.types.Types; | ||
| import org.apache.iceberg.variants.ValueArray; | ||
| import org.apache.iceberg.variants.Variant; | ||
| import org.apache.iceberg.variants.VariantArray; | ||
| import org.apache.iceberg.variants.VariantMetadata; | ||
| import org.apache.iceberg.variants.VariantObject; | ||
| import org.apache.iceberg.variants.VariantTestUtil; | ||
| import org.apache.iceberg.variants.VariantValue; | ||
| import org.apache.iceberg.variants.Variants; | ||
| import org.junit.jupiter.params.ParameterizedTest; | ||
| import org.junit.jupiter.params.provider.FieldSource; | ||
|
|
@@ -73,6 +76,12 @@ public class TestVariantWriters { | |
| "c", Variants.of("string"))); | ||
| private static final ByteBuffer EMPTY_OBJECT_BUFFER = | ||
| VariantTestUtil.createObject(TEST_METADATA_BUFFER, ImmutableMap.of()); | ||
| /* Test fixture: object with an int in field "a" and a two-string array in field "c". */ private static final ByteBuffer ARRAY_IN_OBJECT_BUFFER = | ||
| VariantTestUtil.createObject( | ||
| TEST_METADATA_BUFFER, | ||
| ImmutableMap.of( | ||
| "a", Variants.of(123456789), | ||
| "c", array(Variants.of("string"), Variants.of("iceberg")))); | ||
|
|
||
| private static final VariantMetadata EMPTY_METADATA = | ||
| Variants.metadata(VariantTestUtil.emptyMetadata()); | ||
|
|
@@ -83,6 +92,30 @@ public class TestVariantWriters { | |
| (VariantObject) Variants.value(TEST_METADATA, SIMILAR_OBJECT_BUFFER); | ||
| private static final VariantObject EMPTY_OBJECT = | ||
| (VariantObject) Variants.value(TEST_METADATA, EMPTY_OBJECT_BUFFER); | ||
| /* ARRAY_IN_OBJECT_BUFFER read back as a VariantObject using TEST_METADATA. */ private static final VariantObject ARRAY_IN_OBJECT = | ||
| (VariantObject) Variants.value(TEST_METADATA, ARRAY_IN_OBJECT_BUFFER); | ||
|
|
||
| /* Array created with no elements. */ private static final ByteBuffer EMPTY_ARRAY_BUFFER = VariantTestUtil.createArray(); | ||
| /* Flat array mixing two strings and the int 34. */ private static final ByteBuffer TEST_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray(Variants.of("iceberg"), Variants.of("string"), Variants.of(34)); | ||
| /* Array whose elements are four inner arrays (one empty, one containing a Variants.ofNull() element) followed by the non-array value 34. */ private static final ByteBuffer NESTED_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray( | ||
| array(Variants.of("string"), Variants.of("iceberg"), Variants.of(34)), | ||
| array(Variants.of(34), Variants.ofNull()), | ||
| array(), | ||
| array(Variants.of("string"), Variants.of("iceberg")), | ||
| Variants.of(34)); | ||
| /* Array whose three elements are the object fixtures TEST_OBJECT, EMPTY_OBJECT and SIMILAR_OBJECT. */ private static final ByteBuffer OBJECT_IN_ARRAY_BUFFER = | ||
| VariantTestUtil.createArray(TEST_OBJECT, EMPTY_OBJECT, SIMILAR_OBJECT); | ||
|
|
||
| /* The four buffers read back as VariantArray values; only the object array is read with TEST_METADATA, the rest use EMPTY_METADATA. */ private static final VariantArray EMPTY_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, EMPTY_ARRAY_BUFFER); | ||
| private static final VariantArray TEST_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, TEST_ARRAY_BUFFER); | ||
| private static final VariantArray TEST_NESTED_ARRAY = | ||
| (VariantArray) Variants.value(EMPTY_METADATA, NESTED_ARRAY_BUFFER); | ||
| private static final VariantArray TEST_OBJECT_IN_ARRAY = | ||
| (VariantArray) Variants.value(TEST_METADATA, OBJECT_IN_ARRAY_BUFFER); | ||
|
|
||
| private static final Variant[] VARIANTS = | ||
| new Variant[] { | ||
|
|
@@ -104,6 +137,11 @@ public class TestVariantWriters { | |
| Variant.of(EMPTY_METADATA, EMPTY_OBJECT), | ||
| Variant.of(TEST_METADATA, TEST_OBJECT), | ||
| Variant.of(TEST_METADATA, SIMILAR_OBJECT), | ||
| Variant.of(TEST_METADATA, ARRAY_IN_OBJECT), | ||
| Variant.of(EMPTY_METADATA, EMPTY_ARRAY), | ||
| Variant.of(EMPTY_METADATA, TEST_ARRAY), | ||
| Variant.of(EMPTY_METADATA, TEST_NESTED_ARRAY), | ||
| Variant.of(TEST_METADATA, TEST_OBJECT_IN_ARRAY), | ||
|
||
| Variant.of(EMPTY_METADATA, Variants.ofIsoDate("2024-11-07")), | ||
| Variant.of(EMPTY_METADATA, Variants.ofIsoDate("1957-11-07")), | ||
| Variant.of(EMPTY_METADATA, Variants.ofIsoTimestamptz("2024-11-07T12:33:54.123456+00:00")), | ||
|
|
@@ -200,4 +238,13 @@ private static List<Record> writeAndRead( | |
| return Lists.newArrayList(reader); | ||
| } | ||
| } | ||
|
|
||
| /** Test helper: starts from Variants.array() and adds each given value in argument order; with no arguments the array is returned without any add call. */ private static ValueArray array(VariantValue... values) { | ||
| ValueArray arr = Variants.array(); | ||
| for (VariantValue value : values) { | ||
| arr.add(value); | ||
| } | ||
|
|
||
| return arr; | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.