Flink: add _row_id and _last_updated_sequence_number readers #14148
Changes from all commits: a0372b4, 01c8251, 8cf2f8e, 6356e37, d6a877c
@@ -34,7 +34,6 @@
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Schema;
import org.apache.iceberg.parquet.ParquetSchemaUtil;
import org.apache.iceberg.parquet.ParquetValueReader;
@@ -90,10 +89,13 @@ public ParquetValueReader<RowData> message(
@SuppressWarnings("checkstyle:CyclomaticComplexity")
public ParquetValueReader<RowData> struct(
    Types.StructType expected, GroupType struct, List<ParquetValueReader<?>> fieldReaders) {

  if (null == expected) {
    return new RowDataReader(ImmutableList.of());
  }

  // match the expected struct's order
  Map<Integer, ParquetValueReader<?>> readersById = Maps.newHashMap();
  Map<Integer, Type> typesById = Maps.newHashMap();
  Map<Integer, Integer> maxDefinitionLevelsById = Maps.newHashMap();
  List<Type> fields = struct.getFields();
  for (int i = 0; i < fields.size(); i += 1) {
    Type fieldType = fields.get(i);
@@ -102,51 +104,39 @@ public ParquetValueReader<RowData> struct(
      if (fieldType.getId() != null) {
        int id = fieldType.getId().intValue();
        readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReaders.get(i)));
        typesById.put(id, fieldType);
        if (idToConstant.containsKey(id)) {
          maxDefinitionLevelsById.put(id, fieldD);
        }
      }
    }
  }

  List<Types.NestedField> expectedFields =
      expected != null ? expected.fields() : ImmutableList.of();
  int constantDefinitionLevel = type.getMaxDefinitionLevel(currentPath());
  List<Types.NestedField> expectedFields = expected.fields();
  List<ParquetValueReader<?>> reorderedFields =
      Lists.newArrayListWithExpectedSize(expectedFields.size());
  // Defaulting to parent max definition level
  int defaultMaxDefinitionLevel = type.getMaxDefinitionLevel(currentPath());
  for (Types.NestedField field : expectedFields) {
    int id = field.fieldId();
    ParquetValueReader<?> reader = readersById.get(id);
    if (idToConstant.containsKey(id)) {
      // containsKey is used because the constant may be null
      int fieldMaxDefinitionLevel =
          maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel);
      reorderedFields.add(
          ParquetValueReaders.constant(idToConstant.get(id), fieldMaxDefinitionLevel));
    } else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
      reorderedFields.add(ParquetValueReaders.position());
    } else if (id == MetadataColumns.IS_DELETED.fieldId()) {
      reorderedFields.add(ParquetValueReaders.constant(false));
    } else if (reader != null) {
      reorderedFields.add(reader);
    } else if (field.initialDefault() != null) {
      reorderedFields.add(
          ParquetValueReaders.constant(
              RowDataUtil.convertConstant(field.type(), field.initialDefault()),
              maxDefinitionLevelsById.getOrDefault(id, defaultMaxDefinitionLevel)));
    } else if (field.isOptional()) {
      reorderedFields.add(ParquetValueReaders.nulls());
    } else {
      throw new IllegalArgumentException(
          String.format("Missing required field: %s", field.name()));
    }
    ParquetValueReader<?> reader =
        ParquetValueReaders.replaceWithMetadataReader(
            id, readersById.get(id), idToConstant, constantDefinitionLevel);
    reorderedFields.add(defaultReader(field, reader, constantDefinitionLevel));
Comment on lines -121 to +120

Contributor: Just curious, how did you decide to make this refactoring? You consolidated several branches of the if statement.

Contributor (Author): In #12836, the code shared between Spark and Core was consolidated into the core layer. I noticed the Flink code was the same, so I reused it.

(A sketch of the consolidated selection logic appears after this file's diff.)
  }

  return new RowDataReader(reorderedFields);
}

private ParquetValueReader<?> defaultReader(
    Types.NestedField field, ParquetValueReader<?> reader, int constantDL) {
  if (reader != null) {
    return reader;
  } else if (field.initialDefault() != null) {
    return ParquetValueReaders.constant(
        RowDataUtil.convertConstant(field.type(), field.initialDefault()), constantDL);
  } else if (field.isOptional()) {
    return ParquetValueReaders.nulls();
  }

  throw new IllegalArgumentException(String.format("Missing required field: %s", field.name()));
}

@Override
public ParquetValueReader<?> list(
    Types.ListType expectedList, GroupType array, ParquetValueReader<?> elementReader) {
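A note on the refactoring discussed in the comment thread above: the removed branches chose a reader per field id (injected constant, synthetic row position, is-deleted flag, or the regular column reader). The sketch below restates that selection logic using only the ParquetValueReaders and MetadataColumns calls that already appear in this file; it is illustrative only and is not the implementation of ParquetValueReaders.replaceWithMetadataReader from #12836, which presumably also covers the new _row_id and _last_updated_sequence_number columns.

import java.util.Map;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.parquet.ParquetValueReader;
import org.apache.iceberg.parquet.ParquetValueReaders;

class MetadataReaderSelectionSketch {
  // Hypothetical helper, named here for illustration: prefer an injected constant,
  // then the synthetic row-position and is-deleted readers, and otherwise fall back
  // to the regular column reader (which may be null).
  static ParquetValueReader<?> forField(
      int id,
      ParquetValueReader<?> columnReader,
      Map<Integer, ?> idToConstant,
      int constantDefinitionLevel) {
    if (idToConstant.containsKey(id)) {
      // containsKey is used because the constant value itself may be null
      return ParquetValueReaders.constant(idToConstant.get(id), constantDefinitionLevel);
    } else if (id == MetadataColumns.ROW_POSITION.fieldId()) {
      return ParquetValueReaders.position();
    } else if (id == MetadataColumns.IS_DELETED.fieldId()) {
      return ParquetValueReaders.constant(false);
    }

    return columnReader;
  }
}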
@@ -42,8 +42,10 @@
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.flink.FlinkSchemaUtil;
import org.apache.iceberg.flink.TestHelpers;
import org.apache.iceberg.inmemory.InMemoryOutputFile;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.parquet.ParquetValueReader;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
@@ -74,6 +76,11 @@ protected boolean supportsTimestampNanos() {
  return true;
}

@Override
protected boolean supportsRowLineage() {
  return true;
}

@Test
public void testBuildReader() {
  MessageType fileSchema =
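For context on the new supportsRowLineage() override: the row-lineage columns named in the PR title are the _row_id and _last_updated_sequence_number metadata columns. A hypothetical projection that requests them could look like the sketch below, assuming MetadataColumns exposes them as ROW_ID and LAST_UPDATED_SEQUENCE_NUMBER NestedField constants (not shown in this diff); the data column name and id are made up.

import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

// Hypothetical projection for illustration only: "data" and its field id are made up,
// and the two MetadataColumns constants are assumed to exist under these names.
Schema projection =
    new Schema(
        Types.NestedField.required(1, "data", Types.StringType.get()),
        MetadataColumns.ROW_ID,
        MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER);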
@@ -216,29 +223,31 @@ public void testTwoLevelList() throws IOException {

private void writeAndValidate(
    Iterable<Record> iterable, Schema writeSchema, Schema expectedSchema) throws IOException {
  File testFile = File.createTempFile("junit", null, temp.toFile());
  assertThat(testFile.delete()).isTrue();

  OutputFile output = new InMemoryOutputFile();
  try (FileAppender<Record> writer =
      Parquet.write(Files.localOutput(testFile))
Contributor: What's the reason for moving from a local file to in-memory?

Contributor (Author): In #12836, Spark and Core changed this behavior, so I aligned this part with them.

(A condensed sketch of the new write/read round trip appears after this diff.)
      Parquet.write(output)
          .schema(writeSchema)
          .createWriterFunc(GenericParquetWriter::create)
          .build()) {
    writer.addAll(iterable);
  }

  try (CloseableIterable<RowData> reader =
      Parquet.read(Files.localInput(testFile))
      Parquet.read(output.toInputFile())
          .project(expectedSchema)
          .createReaderFunc(type -> FlinkParquetReaders.buildReader(expectedSchema, type))
          .createReaderFunc(
              type -> FlinkParquetReaders.buildReader(expectedSchema, type, ID_TO_CONSTANT))
          .build()) {
    Iterator<Record> expected = iterable.iterator();
    Iterator<RowData> rows = reader.iterator();
    LogicalType rowType = FlinkSchemaUtil.convert(writeSchema);
    for (int i = 0; i < NUM_RECORDS; i += 1) {
    int pos = 0;
    for (Record record : iterable) {
      assertThat(rows).hasNext();
      TestHelpers.assertRowData(writeSchema.asStruct(), rowType, expected.next(), rows.next());
      TestHelpers.assertRowData(
          writeSchema.asStruct(), rowType, record, rows.next(), ID_TO_CONSTANT, pos++);
    }

    assertThat(rows).isExhausted();
  }
}
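As a follow-up to the in-memory question above, here is the round trip in one piece after the change, condensed from the diff; records, writeSchema, expectedSchema, and ID_TO_CONSTANT stand in for the test's fixtures and are not defined here. Keeping the bytes in memory avoids temp-file creation and cleanup and, per the author's reply, matches the Spark/Core tests changed in #12836.

// Condensed sketch of the new round trip: write Parquet bytes to an in-memory buffer,
// then read them back through the Flink reader with the constant map applied.
OutputFile output = new InMemoryOutputFile();

try (FileAppender<Record> writer =
    Parquet.write(output)
        .schema(writeSchema)
        .createWriterFunc(GenericParquetWriter::create)
        .build()) {
  writer.addAll(records);
}

try (CloseableIterable<RowData> reader =
    Parquet.read(output.toInputFile())
        .project(expectedSchema)
        .createReaderFunc(
            type -> FlinkParquetReaders.buildReader(expectedSchema, type, ID_TO_CONSTANT))
        .build()) {
  // compare each RowData against the corresponding expected Record
}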
Comment: The PR needs to go against the 2.1 target (currently: 2.0), as #13714 has been merged.

Comment: Can we focus on the 2.0.0 version for this PR first, and then backport it to 2.1.0 and 1.20 later?

Comment: I'm fine with both.