First version of the Iceberg sink for Apache Beam #1972
TableIdentifier.java:

```diff
@@ -19,6 +19,7 @@
 package org.apache.iceberg.catalog;
 
+import java.io.Serializable;
 import java.util.Arrays;
 import java.util.Objects;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
@@ -28,7 +29,7 @@
 /**
  * Identifies a table in iceberg catalog.
  */
-public class TableIdentifier {
+public class TableIdentifier implements Serializable {
 
   private static final Splitter DOT = Splitter.on('.');
```
Contributor (commenting on `public class TableIdentifier implements Serializable`):

For some of these classes where they are implementing Serializable, should we consider adding a `serialVersionUID`? I have many users (specifically for Flink) that encounter subtle issues when some Serializable class does not specify a `serialVersionUID`. I'm not sure if […]

Additionally, does anybody have any arguments for or against including a serial version UID on the interfaces which are also extending Serializable? For example, […]
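For readers who haven't hit this, a minimal sketch of what the commenter is suggesting (the constant and its value are illustrative, not part of this PR):

```java
public class TableIdentifier implements Serializable {
  // An explicit UID pins the serialized form: without it, the JVM
  // derives a UID from the class structure, so an unrelated change to
  // the class can break deserialization between differently built jars.
  private static final long serialVersionUID = 1L;

  // ... rest of the class unchanged ...
}
```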
New file, BeamAppenderFactory.java:

```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.beam;

import java.io.IOException;
import java.util.Map;
import org.apache.avro.generic.GenericRecord;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.avro.Avro;
import org.apache.iceberg.avro.GenericAvroWriter;
import org.apache.iceberg.beam.writers.GenericAvroOrcWriter;
import org.apache.iceberg.beam.writers.GenericAvroParquetWriter;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.deletes.PositionDeleteWriter;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.exceptions.RuntimeIOException;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.FileAppenderFactory;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.parquet.Parquet;

public class BeamAppenderFactory<T extends GenericRecord> implements FileAppenderFactory<T> {
  private final Schema schema;
  private final Map<String, String> properties;
  private final PartitionSpec spec;

  BeamAppenderFactory(Schema schema, Map<String, String> properties, PartitionSpec spec) {
    this.schema = schema;
    this.properties = properties;
    this.spec = spec;
  }

  @Override
  public FileAppender<T> newAppender(OutputFile file, FileFormat fileFormat) {
    MetricsConfig metricsConfig = MetricsConfig.fromProperties(properties);
    try {
      switch (fileFormat) {
        case AVRO:
          return Avro
              .write(file)
              .createWriterFunc(GenericAvroWriter::new)
              .setAll(properties)
              .schema(schema)
              .overwrite()
              .build();

        case PARQUET:
          return Parquet.write(file)
              .createWriterFunc(GenericAvroParquetWriter::buildWriter)
              .setAll(properties)
              .metricsConfig(metricsConfig)
              .schema(schema)
              .overwrite()
              .build();

        case ORC:
          return ORC.write(file)
              .createWriterFunc(GenericAvroOrcWriter::buildWriter)
              .setAll(properties)
              .metricsConfig(metricsConfig)
              .schema(schema)
              .overwrite()
              .build();

        default:
          throw new UnsupportedOperationException("Cannot write unknown format: " + fileFormat);
      }
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  @Override
  public DataWriter<T> newDataWriter(EncryptedOutputFile file, FileFormat format, StructLike partition) {
    return new DataWriter<>(newAppender(file.encryptingOutputFile(), format), format,
        file.encryptingOutputFile().location(), spec, partition, file.keyMetadata());
  }

  @Override
  public EqualityDeleteWriter<T> newEqDeleteWriter(EncryptedOutputFile file, FileFormat format,
                                                   StructLike partition) {
    // Equality deletes are not supported by this sink yet.
    return null;
  }

  @Override
  public PositionDeleteWriter<T> newPosDeleteWriter(EncryptedOutputFile file, FileFormat format,
                                                    StructLike partition) {
    // Position deletes are not supported by this sink yet.
    return null;
  }
}
```
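For context, `FileAppenderFactory` implementations like this are normally driven by Iceberg's `TaskWriter`s (as the `FileWriter` below does), but the factory can also be exercised directly, e.g. in a test. A rough sketch, assuming a loaded `table` and a matching Avro `record` (both placeholders), and same-package access since the constructor is package-private:

```java
// Hypothetical direct use: write one record to a Parquet file.
BeamAppenderFactory<GenericRecord> factory =
    new BeamAppenderFactory<>(table.schema(), table.properties(), table.spec());

OutputFile outputFile = table.io().newOutputFile("file:/tmp/data-00000.parquet");
try (FileAppender<GenericRecord> appender = factory.newAppender(outputFile, FileFormat.PARQUET)) {
  appender.add(record);  // record must conform to the table schema
}
```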
New file, FileWriter.java:

```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iceberg.beam;

import java.io.IOException;
import java.util.Locale;
import java.util.Map;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.encryption.EncryptionManager;
import org.apache.iceberg.hive.HiveCatalog;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.LocationProvider;
import org.apache.iceberg.io.OutputFileFactory;
import org.apache.iceberg.io.PartitionedWriter;
import org.apache.iceberg.io.TaskWriter;
import org.apache.iceberg.io.UnpartitionedWriter;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileWriter<T extends GenericRecord> extends DoFn<T, DataFile> {
  private static final Logger LOG = LoggerFactory.getLogger(FileWriter.class);

  private final PartitionSpec spec;
  private final TableIdentifier tableIdentifier;
  private final String hiveMetastoreUrl;

  private Map<String, String> properties;
  private Schema schema;
  private LocationProvider locations;
  private FileIO io;
  private EncryptionManager encryptionManager;
  private HiveCatalog catalog;
  private FileFormat fileFormat;

  private transient TaskWriter<T> writer;
  private transient BoundedWindow lastSeenWindow;

  public FileWriter(
      TableIdentifier tableIdentifier,
      Schema schema,
      PartitionSpec spec,
      String hiveMetastoreUrl,
      Map<String, String> properties
  ) {
    this.tableIdentifier = tableIdentifier;
    this.spec = spec;
    this.hiveMetastoreUrl = hiveMetastoreUrl;
    this.schema = schema;
    this.properties = properties;
  }

  @StartBundle
  public void startBundle(StartBundleContext sbc) {
    start();
  }

  @ProcessElement
  public void processElement(ProcessContext context, BoundedWindow window) {
    // The pane index is reused as both partition id and task id so that
    // the OutputFileFactory produces distinct file names per pane.
    appendRecord(
        context.element(),
        window,
        (int) context.pane().getIndex(),
        context.pane().getIndex()
    );
  }

  @FinishBundle
  public void finishBundle(FinishBundleContext fbc) {
    DataFile[] files = finish();
    for (DataFile file : files) {
      fbc.output(file, Instant.now(), lastSeenWindow);
    }
  }

  @VisibleForTesting
  public void start() {
    // Copy the pipeline-provided properties into the Hadoop configuration
    // used by the Hive metastore client.
    Configuration conf = new Configuration();
    for (String key : this.properties.keySet()) {
      conf.set(key, this.properties.get(key));
    }
    catalog = new HiveCatalog(
        HiveCatalog.DEFAULT_NAME,
        this.hiveMetastoreUrl,
        1,
        conf
    );
    Table table = HiveCatalogHelper.loadOrCreateTable(
        catalog,
        tableIdentifier,
        schema,
        spec,
        properties
    );
    this.schema = table.schema();
    this.locations = table.locationProvider();
    this.properties = table.properties();
    this.io = table.io();
    this.encryptionManager = table.encryption();
    String formatString = table.properties().getOrDefault(
        TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
    this.fileFormat = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
  }

  @VisibleForTesting
  public void appendRecord(T element, BoundedWindow window, int partitionId, long taskId) {
    if (writer == null) {
      LOG.info("Setting up the writer");
      // We would rather do this in startBundle, but the pane index is not known there

      BeamAppenderFactory<T> appenderFactory = new BeamAppenderFactory<>(schema, properties, spec);
      OutputFileFactory fileFactory = new OutputFileFactory(
          spec, fileFormat, locations, io, encryptionManager, partitionId, taskId);

      if (spec.isUnpartitioned()) {
        writer = new UnpartitionedWriter<>(spec, fileFormat, appenderFactory, fileFactory, io, Long.MAX_VALUE);
      } else {
        writer = new PartitionedWriter<T>(
            spec, fileFormat, appenderFactory, fileFactory, io, Long.MAX_VALUE) {
          @Override
          protected PartitionKey partition(T row) {
            return new PartitionKey(spec, schema);
          }
        };
      }
    }
    try {
      lastSeenWindow = window;
      writer.write(element);
```
Contributor (commenting on `writer.write(element);`):

What if we get records for multiple partitions? I think […]
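As written, the `partition(T row)` override above returns a fresh, never-populated `PartitionKey` for every row, so partition values are never derived from the record. A sketch of the direction the comment points at; `AvroStructLike` is a hypothetical adapter (not in this PR) exposing the Avro record as an Iceberg `StructLike`:

```java
// Reusable key, filled from each row's partition source fields.
private final PartitionKey partitionKey = new PartitionKey(spec, schema);

@Override
protected PartitionKey partition(T row) {
  // AvroStructLike is hypothetical: PartitionKey reads partition source
  // values through the StructLike interface.
  partitionKey.partition(new AvroStructLike(row, schema));
  return partitionKey;  // PartitionedWriter copies the key it retains
}
```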
```java
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @VisibleForTesting
  public DataFile[] finish() {
    LOG.info("Closing the writer");
    try {
      writer.close();
      return writer.dataFiles();
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      writer = null;
      catalog.close();
      catalog = null;
    }
  }
}
```
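To see how the `DoFn` is meant to be wired up, a rough end-to-end sketch; the table name, metastore URI, and upstream `records` collection are placeholders, and the emitted `DataFile`s would still need a downstream step that commits them to the table:

```java
// Hypothetical pipeline wiring: GenericRecords in, DataFiles out.
static PCollection<DataFile> writeToIceberg(
    PCollection<GenericRecord> records, Schema icebergSchema) {
  return records.apply("WriteIcebergFiles",
      ParDo.of(new FileWriter<GenericRecord>(
          TableIdentifier.parse("db.events"),  // placeholder table
          icebergSchema,
          PartitionSpec.unpartitioned(),
          "thrift://metastore:9083",           // placeholder metastore URI
          java.util.Collections.emptyMap())));
}
```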