Flink: Introduce Flink InputFormat #1346
Changes from all commits: e0fce7f, b6e9ecd, b13ed9b, 8d080ad, 3c727fa, 6c5f830, e27e421, 45af8c7
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,51 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| package org.apache.iceberg.flink; | ||
|
|
||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.types.FixupTypes; | ||
| import org.apache.iceberg.types.Type; | ||
| import org.apache.iceberg.types.TypeUtil; | ||
| import org.apache.iceberg.types.Types; | ||
|
|
||
| /** | ||
| * The uuid and fixed are converted to the same Flink type. Conversion back can produce only one, | ||
| * which may not be correct. | ||
| */ | ||
| class FlinkFixupTypes extends FixupTypes { | ||
|
|
||
| private FlinkFixupTypes(Schema referenceSchema) { | ||
| super(referenceSchema); | ||
| } | ||
|
|
||
| static Schema fixup(Schema schema, Schema referenceSchema) { | ||
| return new Schema(TypeUtil.visit(schema, | ||
| new FlinkFixupTypes(referenceSchema)).asStructType().fields()); | ||
| } | ||
|
|
||
| @Override | ||
| protected boolean fixupPrimitive(Type.PrimitiveType type, Type source) { | ||
| if (type instanceof Types.FixedType) { | ||
| int length = ((Types.FixedType) type).length(); | ||
| return source.typeId() == Type.TypeID.UUID && length == 16; | ||
| } | ||
| return false; | ||
| } | ||
| } | ||
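To make the ambiguity concrete, here is a minimal sketch (illustrative only; it uses just the APIs visible in this diff plus Iceberg's `Types` factory methods, and would need to live in the same package since `fixup` is package-private). A `uuid` column that round-trips through Flink comes back as `fixed(16)`; fixing it up against the reference schema restores the original type:

```java
// A table schema with a uuid column, and the same schema after a round trip
// through Flink's type system, where uuid has collapsed into fixed(16).
Schema table = new Schema(
    Types.NestedField.required(1, "id", Types.UUIDType.get()));
Schema roundTripped = new Schema(
    Types.NestedField.required(1, "id", Types.FixedType.ofLength(16)));

// fixup() consults the reference schema; fixupPrimitive matches fixed(16)
// against a uuid source, so the uuid type is restored.
Schema fixed = FlinkFixupTypes.fixup(roundTripped, table);
// fixed.findField("id").type() is now uuid again
```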
In `FlinkOrcReader.java`, the constructors become public and the static `buildReader` helper is removed:

```diff
@@ -39,18 +39,14 @@
 public class FlinkOrcReader implements OrcRowReader<RowData> {
   private final OrcValueReader<?> reader;
 
-  private FlinkOrcReader(Schema iSchema, TypeDescription readSchema) {
+  public FlinkOrcReader(Schema iSchema, TypeDescription readSchema) {
     this(iSchema, readSchema, ImmutableMap.of());
   }
 
-  private FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map<Integer, ?> idToConstant) {
+  public FlinkOrcReader(Schema iSchema, TypeDescription readSchema, Map<Integer, ?> idToConstant) {
     this.reader = OrcSchemaWithTypeVisitor.visit(iSchema, readSchema, new ReadBuilder(idToConstant));
   }
 
-  public static OrcRowReader<RowData> buildReader(Schema schema, TypeDescription readSchema) {
-    return new FlinkOrcReader(schema, readSchema);
-  }
-
   @Override
   public RowData read(VectorizedRowBatch batch, int row) {
     return (RowData) reader.read(new StructColumnVector(batch.size, batch.cols), row);
```

Review thread on the removed `buildReader`:

**Contributor:** Was this not used anywhere?

**Author:** Because there are two constructors, we would need two static helpers; I think we can use the constructors directly.

**Author:** It is just used by testing code.
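With the constructors public, a caller can hand `FlinkOrcReader` straight to Iceberg's ORC read builder. A minimal sketch, assuming the `ORC.read(...)` builder API; `inputFile` and `projectedSchema` are placeholders:

```java
// Sketch only: the public constructor replaces the removed buildReader(...) helper.
CloseableIterable<RowData> rows = ORC.read(inputFile)
    .project(projectedSchema)
    .createReaderFunc(fileSchema -> new FlinkOrcReader(projectedSchema, fileSchema))
    .build();
```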
In the ORC value readers, a precision guard is added for decimals:

```diff
@@ -36,6 +36,7 @@
 import org.apache.flink.table.data.TimestampData;
 import org.apache.iceberg.orc.OrcValueReader;
 import org.apache.iceberg.orc.OrcValueReaders;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.types.Types;
@@ -127,6 +128,11 @@ private static class Decimal18Reader implements OrcValueReader<DecimalData> {
   @Override
   public DecimalData nonNullRead(ColumnVector vector, int row) {
     HiveDecimalWritable value = ((DecimalColumnVector) vector).vector[row];
+
+    // The Hive ORC writer may adjust the scale of decimal data.
+    Preconditions.checkArgument(value.precision() <= precision,
+        "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value);
+
     return DecimalData.fromUnscaledLong(value.serialize64(scale), precision, scale);
   }
 }
@@ -143,6 +149,10 @@ private static class Decimal38Reader implements OrcValueReader<DecimalData> {
   @Override
   public DecimalData nonNullRead(ColumnVector vector, int row) {
     BigDecimal value = ((DecimalColumnVector) vector).vector[row].getHiveDecimal().bigDecimalValue();
+
+    Preconditions.checkArgument(value.precision() <= precision,
+        "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, value);
+
     return DecimalData.fromBigDecimal(value, precision, scale);
   }
 }
```

Review thread on the new check:

**Contributor:** Does Flink require this?

**Author:** I think it is better to add this check to avoid potential precision-mismatch bugs.
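To see what the guard rejects, a small standalone illustration (plain Guava `Preconditions` stands in for Iceberg's relocated copy; the values are made up):

```java
import java.math.BigDecimal;
import com.google.common.base.Preconditions;

public class DecimalGuardExample {
  public static void main(String[] args) {
    // A stored decimal whose precision exceeds the declared decimal(6,2)
    // cannot be represented losslessly, so the reader fails fast.
    BigDecimal stored = new BigDecimal("12345.678"); // precision 8, scale 3
    int precision = 6;
    int scale = 2;
    Preconditions.checkArgument(stored.precision() <= precision,
        "Cannot read value as decimal(%s,%s), too large: %s", precision, scale, stored);
    // throws IllegalArgumentException:
    //   Cannot read value as decimal(6,2), too large: 12345.678
  }
}
```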
And a fix in `StructReader`: the field count must come from the struct, not from the readers list:

```diff
@@ -246,7 +256,7 @@ private static class StructReader extends OrcValueReaders.StructReader<RowData> {
 
   StructReader(List<OrcValueReader<?>> readers, Types.StructType struct, Map<Integer, ?> idToConstant) {
     super(readers, struct, idToConstant);
-    this.numFields = readers.size();
+    this.numFields = struct.fields().size();
   }
 
   @Override
```

**Author:** This fixes a Flink ORC reader bug with partitioned reads.

**Member:** Nice catch, if the schema is a projected read schema, then the …
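The effect of the fix, as a hedged illustration (field names and the constant map are hypothetical): with a projected read schema plus a constant partition column, only the file columns get value readers, so `readers.size()` under-counts the output row:

```java
import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.types.Types;

public class NumFieldsExample {
  public static void main(String[] args) {
    // Three output fields, one of which ("dt") is a partition constant that
    // is supplied via idToConstant and never read from the ORC file.
    Types.StructType struct = Types.StructType.of(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.required(2, "data", Types.StringType.get()),
        Types.NestedField.required(3, "dt", Types.StringType.get()));
    Map<Integer, String> idToConstant = ImmutableMap.of(3, "2020-08-14");

    int fileReaders = struct.fields().size() - idToConstant.size();
    int numFields = struct.fields().size();
    System.out.println("readers built from the file: " + fileReaders); // 2
    System.out.println("fields in every output row:  " + numFields);   // 3
  }
}
```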
`DataIterator.java` (new file):

```java
/* Apache License 2.0 header, identical to the one above. */

package org.apache.iceberg.flink.source;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.Iterator;
import org.apache.avro.generic.GenericData;
import org.apache.avro.util.Utf8;
import org.apache.flink.table.data.DecimalData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.encryption.EncryptedFiles;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.ByteBuffers;
import org.apache.iceberg.util.DateTimeUtil;

/**
 * Base class of Flink iterators.
 *
 * @param <T> is the Java class returned by this iterator whose objects contain one or more rows.
 */
abstract class DataIterator<T> implements CloseableIterator<T> {

  private Iterator<FileScanTask> tasks;
  private final FileIO io;
  private final EncryptionManager encryption;

  private CloseableIterator<T> currentIterator;

  DataIterator(CombinedScanTask task, FileIO io, EncryptionManager encryption) {
    this.tasks = task.files().iterator();
    this.io = io;
    this.encryption = encryption;
    this.currentIterator = CloseableIterator.empty();
  }

  InputFile getInputFile(FileScanTask task) {
    Preconditions.checkArgument(!task.isDataTask(), "Invalid task type");
    return encryption.decrypt(EncryptedFiles.encryptedInput(
        io.newInputFile(task.file().path().toString()),
        task.file().keyMetadata()));
  }

  @Override
  public boolean hasNext() {
    updateCurrentIterator();
    return currentIterator.hasNext();
  }

  @Override
  public T next() {
    updateCurrentIterator();
    return currentIterator.next();
  }

  /**
   * Updates the current iterator field to ensure that the current Iterator
   * is not exhausted.
   */
  private void updateCurrentIterator() {
    try {
      while (!currentIterator.hasNext() && tasks.hasNext()) {
        currentIterator.close();
        currentIterator = openTaskIterator(tasks.next());
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  abstract CloseableIterator<T> openTaskIterator(FileScanTask scanTask) throws IOException;

  @Override
  public void close() throws IOException {
    // close the current iterator
    currentIterator.close();
    tasks = null;
  }

  static Object convertConstant(Type type, Object value) {
    if (value == null) {
      return null;
    }

    switch (type.typeId()) {
      case DECIMAL: // DecimalData
        Types.DecimalType decimal = (Types.DecimalType) type;
        return DecimalData.fromBigDecimal((BigDecimal) value, decimal.precision(), decimal.scale());
      case STRING: // StringData
        if (value instanceof Utf8) {
          Utf8 utf8 = (Utf8) value;
          return StringData.fromBytes(utf8.getBytes(), 0, utf8.getByteLength());
        }
        return StringData.fromString(value.toString());
      case FIXED: // byte[]
        if (value instanceof byte[]) {
          return value;
        } else if (value instanceof GenericData.Fixed) {
          return ((GenericData.Fixed) value).bytes();
        }
        return ByteBuffers.toByteArray((ByteBuffer) value);
      case BINARY: // byte[]
        return ByteBuffers.toByteArray((ByteBuffer) value);
      case TIME: // int millis instead of long
        return (int) ((Long) value / 1000);
      case TIMESTAMP: // TimestampData
        return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value));
      default:
    }
    return value;
  }
}
```

Review threads on this file:

On the `org.apache.iceberg.flink.source` package declaration:

**Member:** Here, you put all reader related classes inside the …

**Author:** I think we can.

On the `Utf8` branch in `convertConstant`:

**Member:** Q: is it possible that we will step into this …

**Author:** I think it is good to keep it safe.

On the `GenericData.Fixed` branch:

**Member:** Same question here, would it be possible that the value is a …

**Author:** The … I think it is good to keep it safe.
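The PR's concrete iterators are not shown in this excerpt, so here is a hedged sketch of how a subclass might plug in. `AvroDataIterator` and `FlinkAvroReader` are assumptions for illustration; only the `DataIterator` contract (implement `openTaskIterator` and let the base class chain tasks) comes from the code above:

```java
package org.apache.iceberg.flink.source;

import java.io.IOException;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.FileIO;

// Hypothetical subclass, not part of this diff: reads each FileScanTask with
// Iceberg's Avro read builder and lets DataIterator chain the per-file iterators.
class AvroDataIterator extends DataIterator<RowData> {
  private final Schema projectedSchema;

  AvroDataIterator(CombinedScanTask task, Schema projectedSchema,
                   FileIO io, EncryptionManager encryption) {
    super(task, io, encryption);
    this.projectedSchema = projectedSchema;
  }

  @Override
  CloseableIterator<RowData> openTaskIterator(FileScanTask task) throws IOException {
    CloseableIterable<RowData> iterable = Avro.read(getInputFile(task))
        .project(projectedSchema)
        // FlinkAvroReader is assumed to exist alongside FlinkOrcReader
        .createReaderFunc(readSchema -> new FlinkAvroReader(projectedSchema, readSchema))
        .split(task.start(), task.length())
        .build();
    return iterable.iterator();
  }
}
```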
Finally, on `FlinkFixupTypes` and its UUID handling:

**Member:** We have a discussion in #1302 for removing UUID. Is this a temporary solution?

**Author:** Yes.