-
Notifications
You must be signed in to change notification settings - Fork 3k
Flink: FLIP-27 Iceberg source split #3501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
99262f2
02b9539
6290574
c874f42
29ce395
51b056d
efb813c
fab7e11
1bcd5c3
6a8657b
e9b2c84
7a0f9cb
dc90b21
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg.flink.source.split; | ||
|
|
||
| import java.io.IOException; | ||
| import java.io.Serializable; | ||
| import java.util.Collection; | ||
| import java.util.stream.Collectors; | ||
| import javax.annotation.Nullable; | ||
| import org.apache.flink.annotation.Internal; | ||
| import org.apache.flink.api.connector.source.SourceSplit; | ||
| import org.apache.flink.util.InstantiationUtil; | ||
| import org.apache.iceberg.CombinedScanTask; | ||
| import org.apache.iceberg.FileScanTask; | ||
| import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Iterables; | ||
|
|
||
| @Internal | ||
| public class IcebergSourceSplit implements SourceSplit, Serializable { | ||
openinx marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| private static final long serialVersionUID = 1L; | ||
|
|
||
| private final CombinedScanTask task; | ||
|
|
||
| private int fileOffset; | ||
| private long recordOffset; | ||
|
|
||
| // The splits are frequently serialized into checkpoints. | ||
| // Caching the byte representation makes repeated serialization cheap. | ||
| @Nullable | ||
| private transient byte[] serializedBytesCache; | ||
|
|
||
| private IcebergSourceSplit(CombinedScanTask task, int fileOffset, long recordOffset) { | ||
| this.task = task; | ||
| this.fileOffset = fileOffset; | ||
| this.recordOffset = recordOffset; | ||
| } | ||
|
|
||
| public static IcebergSourceSplit fromCombinedScanTask(CombinedScanTask combinedScanTask) { | ||
| return fromCombinedScanTask(combinedScanTask, 0, 0L); | ||
| } | ||
|
|
||
| public static IcebergSourceSplit fromCombinedScanTask( | ||
| CombinedScanTask combinedScanTask, int fileOffset, long recordOffset) { | ||
| return new IcebergSourceSplit(combinedScanTask, fileOffset, recordOffset); | ||
| } | ||
|
|
||
| public CombinedScanTask task() { | ||
| return task; | ||
| } | ||
|
|
||
| public int fileOffset() { | ||
| return fileOffset; | ||
| } | ||
|
|
||
| public long recordOffset() { | ||
| return recordOffset; | ||
| } | ||
|
|
||
| @Override | ||
| public String splitId() { | ||
| return MoreObjects.toStringHelper(this) | ||
| .add("files", toString(task.files())) | ||
| .toString(); | ||
| } | ||
|
|
||
| public void updatePosition(int newFileOffset, long newRecordOffset) { | ||
| // invalidate the cache after position change | ||
| serializedBytesCache = null; | ||
| fileOffset = newFileOffset; | ||
| recordOffset = newRecordOffset; | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return MoreObjects.toStringHelper(this) | ||
| .add("files", toString(task.files())) | ||
| .add("fileOffset", fileOffset) | ||
| .add("recordOffset", recordOffset) | ||
| .toString(); | ||
| } | ||
|
|
||
| private String toString(Collection<FileScanTask> files) { | ||
| return Iterables.toString(files.stream().map(fileScanTask -> | ||
| MoreObjects.toStringHelper(fileScanTask) | ||
| .add("file", fileScanTask.file().path().toString()) | ||
| .add("start", fileScanTask.start()) | ||
| .add("length", fileScanTask.length()) | ||
| .toString()).collect(Collectors.toList())); | ||
| } | ||
|
|
||
| byte[] serializeV1() throws IOException { | ||
| if (serializedBytesCache == null) { | ||
| serializedBytesCache = InstantiationUtil.serializeObject(this); | ||
| } | ||
| return serializedBytesCache; | ||
| } | ||
|
|
||
| static IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { | ||
| try { | ||
| return InstantiationUtil.deserializeObject(serialized, IcebergSourceSplit.class.getClassLoader()); | ||
| } catch (ClassNotFoundException e) { | ||
| throw new RuntimeException("Failed to deserialize the split.", e); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg.flink.source.split; | ||
|
|
||
| import java.io.IOException; | ||
| import org.apache.flink.annotation.Internal; | ||
| import org.apache.flink.core.io.SimpleVersionedSerializer; | ||
|
|
||
| /** | ||
| * TODO: use Java serialization for now. | ||
| * Will switch to more stable serializer from | ||
| * <a href="https://github.com/apache/iceberg/issues/1698">issue-1698</a>. | ||
| */ | ||
| @Internal | ||
| public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer<IcebergSourceSplit> { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will this get used to serialize / deserialize the splits across task boundaries, or just for checkpoints? The comment in
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question for my own understanding after looking through It seems like Is that correct? Are there any best practices when working with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This class is used for checkpoint state serializer. cross-process (JM->TM) communication is via Java serializable. Currently, we are using the Java serializable inside this class too for simpler start. This is not ideal, as we know Java serializable is not good with schema evolution. Schema evolution would be important for long-running streaming jobs (not so much for batch jobs). In the class Javadoc, we linked to an issue for future improvement. Note that this is already an issue for the current
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| public static final IcebergSourceSplitSerializer INSTANCE = new IcebergSourceSplitSerializer(); | ||
| private static final int VERSION = 1; | ||
|
|
||
| @Override | ||
| public int getVersion() { | ||
| return VERSION; | ||
| } | ||
|
|
||
| @Override | ||
| public byte[] serialize(IcebergSourceSplit split) throws IOException { | ||
| return split.serializeV1(); | ||
| } | ||
|
|
||
| @Override | ||
| public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOException { | ||
| switch (version) { | ||
| case 1: | ||
| return IcebergSourceSplit.deserializeV1(serialized); | ||
| default: | ||
| throw new IOException(String.format("Failed to deserialize IcebergSourceSplit. " + | ||
| "Encountered unsupported version: %d. Supported version are [1]", version)); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why add this switch in this PR ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will recommend to make this PR to add the flip-27 source split as focused as possible. So it will recommend to remove the unrelated changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
column stats are needed for event time/watermark aligned assigner. You are correct that it is directly used by this PR. Right now, I am taking the approach of splitting sub PRs at minimally connected files for easier creation of the sub PRs. if you think it is important to avoid unrelated changes inside a file, I can revert the piece of change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@stevenzwu How is your feeling for this comment ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is a reasonable addition. I think the motivation is to change the file in just this PR and not in the others that are part of FLIP-27.