diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java index 133dcb0577bab..9572af5237d25 100644 --- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java +++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java @@ -31,7 +31,7 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.SchemaTestUtil; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.avro.Schema; import org.apache.hadoop.fs.FileStatus; @@ -80,7 +80,7 @@ public void init() throws Exception { // generate 200 records Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema()); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(HoodieCLI.getTableMetaClient(), schema); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(HoodieCLI.getTableMetaClient(), schema); HoodieRecord[] hoodieRecords1 = SchemaTestUtil.generateHoodieTestRecords(0, 100, schema).toArray(new HoodieRecord[100]); HoodieRecord[] hoodieRecords2 = SchemaTestUtil.generateHoodieTestRecords(100, 100, schema).toArray(new HoodieRecord[100]); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index bf9e203627e6b..9b0d40cc180eb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -18,22 +18,24 @@ package org.apache.hudi.config; -import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.bootstrap.BootstrapMode; import org.apache.hudi.common.config.DefaultHoodieConfig; import org.apache.hudi.common.fs.ConsistencyGuardConfig; import org.apache.hudi.client.common.EngineType; import org.apache.hudi.common.model.HoodieCleaningPolicy; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion; import org.apache.hudi.common.table.view.FileSystemViewStorageConfig; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.execution.bulkinsert.BulkInsertSortMode; import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.SimpleAvroKeyGenerator; import org.apache.hudi.metrics.MetricsReporterType; import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite; import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; +import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import javax.annotation.concurrent.Immutable; @@ -59,6 +61,11 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { private static final long serialVersionUID = 0L; public static final String TABLE_NAME = "hoodie.table.name"; + public static final String PRECOMBINE_FIELD_PROP = "hoodie.datasource.write.precombine.field"; + public static final String WRITE_PAYLOAD_CLASS = "hoodie.datasource.write.payload.class"; + public static final String DEFAULT_WRITE_PAYLOAD_CLASS = OverwriteWithLatestAvroPayload.class.getName(); + public 
static final String KEYGENERATOR_CLASS_PROP = "hoodie.datasource.write.keygenerator.class"; + public static final String DEFAULT_KEYGENERATOR_CLASS = SimpleAvroKeyGenerator.class.getName(); public static final String DEFAULT_ROLLBACK_USING_MARKERS = "false"; public static final String ROLLBACK_USING_MARKERS = "hoodie.rollback.using.markers"; public static final String TIMELINE_LAYOUT_VERSION = "hoodie.timeline.layout.version"; @@ -194,6 +201,18 @@ public String getTableName() { return props.getProperty(TABLE_NAME); } + public String getPreCombineField() { + return props.getProperty(PRECOMBINE_FIELD_PROP); + } + + public String getWritePayloadClass() { + return props.getProperty(WRITE_PAYLOAD_CLASS); + } + + public String getKeyGeneratorClass() { + return props.getProperty(KEYGENERATOR_CLASS_PROP); + } + public Boolean shouldAutoCommit() { return Boolean.parseBoolean(props.getProperty(HOODIE_AUTO_COMMIT_PROP)); } @@ -902,6 +921,21 @@ public Builder forTable(String tableName) { return this; } + public Builder withPreCombineField(String preCombineField) { + props.setProperty(PRECOMBINE_FIELD_PROP, preCombineField); + return this; + } + + public Builder withWritePayLoad(String payload) { + props.setProperty(WRITE_PAYLOAD_CLASS, payload); + return this; + } + + public Builder withKeyGenerator(String keyGeneratorClass) { + props.setProperty(KEYGENERATOR_CLASS_PROP, keyGeneratorClass); + return this; + } + public Builder withTimelineLayoutVersion(int version) { props.setProperty(TIMELINE_LAYOUT_VERSION, String.valueOf(version)); return this; @@ -1094,6 +1128,10 @@ protected void setDefaults() { setDefaultOnCondition(props, !props.containsKey(DELETE_PARALLELISM), DELETE_PARALLELISM, DEFAULT_PARALLELISM); setDefaultOnCondition(props, !props.containsKey(ROLLBACK_PARALLELISM), ROLLBACK_PARALLELISM, DEFAULT_ROLLBACK_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(KEYGENERATOR_CLASS_PROP), + KEYGENERATOR_CLASS_PROP, DEFAULT_KEYGENERATOR_CLASS); + setDefaultOnCondition(props, !props.containsKey(WRITE_PAYLOAD_CLASS), + WRITE_PAYLOAD_CLASS, DEFAULT_WRITE_PAYLOAD_CLASS); setDefaultOnCondition(props, !props.containsKey(ROLLBACK_USING_MARKERS), ROLLBACK_USING_MARKERS, DEFAULT_ROLLBACK_USING_MARKERS); setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), COMBINE_BEFORE_INSERT_PROP, diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java similarity index 71% rename from hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java rename to hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java index e167a0f4b6650..2197dde8dd038 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java +++ b/hudi-client/hudi-client-common/src/test/java/org/apache/hudi/testutils/HoodieWriteableTestTable.java @@ -21,10 +21,8 @@ import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.avro.HoodieAvroWriteSupport; -import org.apache.hudi.client.SparkTaskContextSupplier; +import org.apache.hudi.client.common.TaskContextSupplier; import org.apache.hudi.common.bloom.BloomFilter; -import org.apache.hudi.common.bloom.BloomFilterFactory; -import org.apache.hudi.common.bloom.BloomFilterTypeCode; import org.apache.hudi.common.model.HoodieLogFile; import org.apache.hudi.common.model.HoodieRecord; 
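(Illustration only — not part of the patch.) The new write-config surface added to HoodieWriteConfig above can be exercised roughly as in the sketch below; the base path and table name are placeholders, and if the three options are omitted, setDefaults() falls back to OverwriteWithLatestAvroPayload and SimpleAvroKeyGenerator exactly as wired up in this diff.

```java
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.keygen.SimpleAvroKeyGenerator;

public class WriteConfigSketch { // hypothetical helper, only for illustration
  public static HoodieWriteConfig build(String basePath) {
    // Exercises the builder methods introduced in this patch.
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)                    // placeholder base path
        .forTable("hoodie_test_table")         // placeholder table name
        .withPreCombineField("ts")
        .withWritePayLoad(OverwriteWithLatestAvroPayload.class.getName())
        .withKeyGenerator(SimpleAvroKeyGenerator.class.getName())
        .build();
  }
}
```

The matching getters (getPreCombineField(), getWritePayloadClass(), getKeyGeneratorClass()) read the same property keys back, which is what the Flink changes later in this diff rely on.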
import org.apache.hudi.common.model.HoodieRecordLocation; @@ -37,7 +35,6 @@ import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.io.storage.HoodieAvroParquetConfig; import org.apache.hudi.io.storage.HoodieParquetWriter; -import org.apache.hudi.table.HoodieTable; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; @@ -57,7 +54,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.UUID; import java.util.stream.Collectors; import static org.apache.hudi.common.testutils.FileCreateUtils.baseFileName; @@ -65,35 +61,15 @@ public class HoodieWriteableTestTable extends HoodieTestTable { private static final Logger LOG = LogManager.getLogger(HoodieWriteableTestTable.class); - private final Schema schema; - private final BloomFilter filter; + protected final Schema schema; + protected final BloomFilter filter; - private HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { + protected HoodieWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { super(basePath, fs, metaClient); this.schema = schema; this.filter = filter; } - public static HoodieWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { - return new HoodieWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, schema, filter); - } - - public static HoodieWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema) { - BloomFilter filter = BloomFilterFactory - .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); - return of(metaClient, schema, filter); - } - - public static HoodieWriteableTestTable of(HoodieTable hoodieTable, Schema schema) { - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - return of(metaClient, schema); - } - - public static HoodieWriteableTestTable of(HoodieTable hoodieTable, Schema schema, BloomFilter filter) { - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); - return of(metaClient, schema, filter); - } - @Override public HoodieWriteableTestTable addCommit(String instantTime) throws Exception { return (HoodieWriteableTestTable) super.addCommit(instantTime); @@ -104,29 +80,7 @@ public HoodieWriteableTestTable forCommit(String instantTime) { return (HoodieWriteableTestTable) super.forCommit(instantTime); } - public String getFileIdWithInserts(String partition) throws Exception { - return getFileIdWithInserts(partition, new HoodieRecord[0]); - } - - public String getFileIdWithInserts(String partition, HoodieRecord... records) throws Exception { - return getFileIdWithInserts(partition, Arrays.asList(records)); - } - - public String getFileIdWithInserts(String partition, List records) throws Exception { - String fileId = UUID.randomUUID().toString(); - withInserts(partition, fileId, records); - return fileId; - } - - public HoodieWriteableTestTable withInserts(String partition, String fileId) throws Exception { - return withInserts(partition, fileId, new HoodieRecord[0]); - } - - public HoodieWriteableTestTable withInserts(String partition, String fileId, HoodieRecord... 
records) throws Exception { - return withInserts(partition, fileId, Arrays.asList(records)); - } - - public HoodieWriteableTestTable withInserts(String partition, String fileId, List records) throws Exception { + public HoodieWriteableTestTable withInserts(String partition, String fileId, List records, TaskContextSupplier contextSupplier) throws Exception { FileCreateUtils.createPartitionMetaFile(basePath, partition); String fileName = baseFileName(currentInstantTime, fileId); @@ -138,7 +92,7 @@ public HoodieWriteableTestTable withInserts(String partition, String fileId, Lis try (HoodieParquetWriter writer = new HoodieParquetWriter( currentInstantTime, new Path(Paths.get(basePath, partition, fileName).toString()), - config, schema, new SparkTaskContextSupplier())) { + config, schema, contextSupplier)) { int seqId = 1; for (HoodieRecord record : records) { GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get(); diff --git a/hudi-client/hudi-flink-client/pom.xml b/hudi-client/hudi-flink-client/pom.xml index e07f0c672262e..721f8d758f7f7 100644 --- a/hudi-client/hudi-flink-client/pom.xml +++ b/hudi-client/hudi-flink-client/pom.xml @@ -115,6 +115,28 @@ test + + + org.apache.flink + flink-test-utils_${scala.binary.version} + ${flink.version} + test + + + org.apache.flink + flink-runtime_${scala.binary.version} + ${flink.version} + test + tests + + + org.apache.flink + flink-streaming-java_${scala.binary.version} + ${flink.version} + test + tests + + org.junit.jupiter diff --git a/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java new file mode 100644 index 0000000000000..4bd9fa39fe00e --- /dev/null +++ b/hudi-client/hudi-flink-client/src/test/java/org/apache/hudi/testutils/HoodieFlinkClientTestHarness.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; +import org.apache.hudi.common.testutils.HoodieTestUtils; + +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.functions.sink.SinkFunction; +import org.apache.flink.test.util.MiniClusterWithClientResource; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestInfo; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +public class HoodieFlinkClientTestHarness extends HoodieCommonTestHarness implements Serializable { + + protected static final Logger LOG = LogManager.getLogger(HoodieFlinkClientTestHarness.class); + private String testMethodName; + protected transient Configuration hadoopConf = null; + protected transient FileSystem fs; + protected transient MiniClusterWithClientResource flinkCluster = null; + + @BeforeEach + public void setTestMethodName(TestInfo testInfo) { + if (testInfo.getTestMethod().isPresent()) { + testMethodName = testInfo.getTestMethod().get().getName(); + } else { + testMethodName = "Unknown"; + } + } + + protected void initFlinkMiniCluster() { + flinkCluster = new MiniClusterWithClientResource( + new MiniClusterResourceConfiguration.Builder() + .setNumberSlotsPerTaskManager(2) + .setNumberTaskManagers(1) + .build()); + } + + protected void initFileSystem() { + hadoopConf = new Configuration(); + initFileSystemWithConfiguration(hadoopConf); + } + + private void initFileSystemWithConfiguration(Configuration configuration) { + if (basePath == null) { + throw new IllegalStateException("The base path has not been initialized."); + } + fs = FSUtils.getFs(basePath, configuration); + if (fs instanceof LocalFileSystem) { + LocalFileSystem lfs = (LocalFileSystem) fs; + // With LocalFileSystem, with checksum disabled, fs.open() returns an inputStream which is FSInputStream + // This causes ClassCastExceptions in LogRecordScanner (and potentially other places) calling fs.open + // So, for the tests, we enforce checksum verification to circumvent the problem + lfs.setVerifyChecksum(true); + } + } + + /** + * Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by + * {@code getTableType()}. + * + * @throws IOException + */ + protected void initMetaClient() throws IOException { + initMetaClient(getTableType()); + } + + protected void initMetaClient(HoodieTableType tableType) throws IOException { + if (basePath == null) { + throw new IllegalStateException("The base path has not been initialized."); + } + metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType); + } + + + /** + * Cleanups file system. 
+ * + * @throws IOException + */ + protected void cleanupFileSystem() throws IOException { + if (fs != null) { + LOG.warn("Closing file-system instance used in previous test-run"); + fs.close(); + fs = null; + } + } + + protected void cleanupFlinkMiniCluster() { + if (flinkCluster != null) { + flinkCluster.after(); + flinkCluster = null; + } + } + + public static class SimpleTestSinkFunction implements SinkFunction { + + // must be static + public static List valuesList = new ArrayList<>(); + + @Override + public synchronized void invoke(HoodieRecord value, Context context) throws Exception { + valuesList.add(value); + } + } +} diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java index d9a396d0f64b5..f4dffe899bd9d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java @@ -62,7 +62,7 @@ import org.apache.hudi.table.action.commit.SparkWriteHelper; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hudi.testutils.HoodieClientTestUtils; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; @@ -114,7 +114,7 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase { @BeforeEach public void setUpTestTable() { - testTable = HoodieWriteableTestTable.of(metaClient); + testTable = HoodieSparkWriteableTestTable.of(metaClient); } /** diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java index f10e845f05b55..a18313b278656 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/TestHoodieIndex.java @@ -39,7 +39,7 @@ import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.Assertions; import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.hudi.testutils.MetadataMergeWriteStatus; import org.apache.avro.Schema; @@ -280,7 +280,7 @@ public void testTagLocationAndFetchRecordLocations(IndexType indexType) throws E } // We create three parquet file, each having one record. 
(two different partitions) - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); String fileId1 = testTable.addCommit("001").getFileIdWithInserts(p1, record1); String fileId2 = testTable.addCommit("002").getFileIdWithInserts(p1, record2); String fileId3 = testTable.addCommit("003").getFileIdWithInserts(p2, record4); @@ -337,7 +337,7 @@ public void testSimpleGlobalIndexTagLocationWhenShouldUpdatePartitionPath(IndexT writeClient = getHoodieWriteClient(config); index = writeClient.getIndex(); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); final String p1 = "2016/01/31"; final String p2 = "2016/02/28"; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java index 8446698305846..b325eb6b1c404 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieBloomIndex.java @@ -33,7 +33,7 @@ import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.avro.Schema; import org.apache.hadoop.fs.Path; @@ -105,7 +105,7 @@ public void testLoadInvolvedFiles(boolean rangePruning, boolean treeFiltering, b HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); SparkHoodieBloomIndex index = new SparkHoodieBloomIndex(config); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); // Create some partitions, and put some files // "2016/01/21": 0 file @@ -222,7 +222,7 @@ public void testCheckUUIDsAgainstOneFile() throws Exception { // record2, record3). 
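(Reviewer note, not part of the diff.) The renamed HoodieSparkWriteableTestTable keeps the old fluent test-table surface, so the call pattern in these bloom-index tests is unchanged; a minimal sketch, assuming metaClient, SCHEMA and the HoodieRecords come from the surrounding test harness:

```java
// Mirrors the calls used by the bloom-index tests touched in this patch.
HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA);
String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1, record2);
String fileName = testTable.getBaseFileNameById(fileId); // base file written for that commit
```

Under the hood the Spark subclass forwards a SparkTaskContextSupplier to the engine-agnostic withInserts(...) that now lives in hudi-client-common.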
BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); filter.add(record3.getRecordKey()); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(metaClient, SCHEMA, filter); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter); String fileId = testTable.addCommit("000").getFileIdWithInserts(partition, record1, record2); String filename = testTable.getBaseFileNameById(fileId); @@ -298,7 +298,7 @@ public void testTagLocation(boolean rangePruning, boolean treeFiltering, boolean // Also create the metadata and config HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); HoodieSparkTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); // Let's tag SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); @@ -363,7 +363,7 @@ public void testCheckExists(boolean rangePruning, boolean treeFiltering, boolean // Also create the metadata and config HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); // Let's tag SparkHoodieBloomIndex bloomIndex = new SparkHoodieBloomIndex(config); @@ -432,7 +432,7 @@ public void testBloomFilterFalseError(boolean rangePruning, boolean treeFilterin BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); filter.add(record2.getRecordKey()); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(metaClient, SCHEMA, filter); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter); String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1); assertTrue(filter.mightContain(record1.getRecordKey())); assertTrue(filter.mightContain(record2.getRecordKey())); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java index c18eeb1d770c8..ff87f967a1d0d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/index/bloom/TestHoodieGlobalBloomIndex.java @@ -27,7 +27,7 @@ import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.avro.Schema; import org.apache.spark.api.java.JavaPairRDD; @@ -76,7 +76,7 @@ public void testLoadInvolvedFiles() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = 
HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); // Create some partitions, and put some files, along with the meta file // "2016/01/21": 0 file @@ -180,7 +180,7 @@ public void testTagLocation() throws Exception { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); // Create some partitions, and put some files, along with the meta file // "2016/01/21": 0 file @@ -261,7 +261,7 @@ public void testTagLocationWhenShouldUpdatePartitionPath() throws Exception { .build(); SparkHoodieGlobalBloomIndex index = new SparkHoodieGlobalBloomIndex(config); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, SCHEMA); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, SCHEMA); final String p1 = "2016/01/31"; final String p2 = "2016/02/28"; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java index ea13645fb954e..3f62494f9c94d 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/io/TestHoodieKeyLocationFetchHandle.java @@ -35,7 +35,7 @@ import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.hudi.testutils.MetadataMergeWriteStatus; import org.junit.jupiter.api.AfterEach; @@ -87,7 +87,7 @@ public void testFetchHandle() throws Exception { List records = dataGen.generateInserts(makeNewCommitTime(), 100); Map> partitionRecordsMap = recordsToPartitionRecordsMap(records); HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable testTable = HoodieWriteableTestTable.of(hoodieTable, AVRO_SCHEMA_WITH_METADATA_FIELDS); + HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, AVRO_SCHEMA_WITH_METADATA_FIELDS); Map, List>> expectedList = writeToParquetAndGetExpectedRecordLocations(partitionRecordsMap, testTable); @@ -103,7 +103,7 @@ public void testFetchHandle() throws Exception { } private Map, List>> writeToParquetAndGetExpectedRecordLocations( - Map> partitionRecordsMap, HoodieWriteableTestTable testTable) throws Exception { + Map> partitionRecordsMap, HoodieSparkWriteableTestTable testTable) throws Exception { Map, List>> expectedList = new HashMap<>(); for (Map.Entry> entry : partitionRecordsMap.entrySet()) { int totalRecordsPerPartition = entry.getValue().size(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java index 42584b12514b4..606a1f067a817 
100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java @@ -71,7 +71,7 @@ import org.apache.hudi.testutils.HoodieClientTestHarness; import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hudi.testutils.HoodieMergeOnReadTestUtils; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.hudi.testutils.MetadataMergeWriteStatus; import org.apache.spark.api.java.JavaRDD; import org.junit.jupiter.api.AfterEach; @@ -999,7 +999,7 @@ public void testLogFileCountsAfterCompaction() throws Exception { // Write them to corresponding avro logfiles metaClient = HoodieTableMetaClient.reload(metaClient); HoodieTable table = HoodieSparkTable.create(config, context, metaClient); - HoodieWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS) + HoodieSparkWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS) .withLogAppends(updatedRecords); // In writeRecordsToLogFiles, no commit files are getting added, so resetting file-system view state ((SyncableFileSystemView) (table.getSliceView())).reset(); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java index 2e6cea70ad921..734fcc2969646 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/compact/TestHoodieCompactor.java @@ -44,7 +44,7 @@ import org.apache.hudi.table.HoodieSparkTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.testutils.HoodieClientTestHarness; -import org.apache.hudi.testutils.HoodieWriteableTestTable; +import org.apache.hudi.testutils.HoodieSparkWriteableTestTable; import org.apache.hadoop.conf.Configuration; import org.apache.spark.api.java.JavaRDD; @@ -155,7 +155,7 @@ public void testWriteStatusContentsAfterCompaction() throws Exception { updatedRecords = ((JavaRDD)index.tagLocation(updatedRecordsRDD, context, table)).collect(); // Write them to corresponding avro logfiles. Also, set the state transition properly. 
- HoodieWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS) + HoodieSparkWriteableTestTable.of(table, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS) .withLogAppends(updatedRecords); metaClient.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, HoodieTimeline.DELTA_COMMIT_ACTION, newCommitTime), Option.empty()); diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java index b10781e3b8ec9..9fa1c47e814a7 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java @@ -28,7 +28,6 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.view.HoodieTableFileSystemView; import org.apache.hudi.common.testutils.HoodieCommonTestHarness; -import org.apache.hudi.common.testutils.HoodieTestDataGenerator; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.testutils.minicluster.HdfsTestService; import org.apache.hudi.common.util.Option; @@ -72,7 +71,6 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im protected transient Configuration hadoopConf = null; protected transient SQLContext sqlContext; protected transient FileSystem fs; - protected transient HoodieTestDataGenerator dataGen = null; protected transient ExecutorService executorService; protected transient HoodieTableMetaClient metaClient; protected transient SparkRDDWriteClient writeClient; @@ -237,24 +235,6 @@ protected void cleanupClients() throws IOException { } } - /** - * Initializes a test data generator which used to generate test datas. - * - */ - protected void initTestDataGenerator() { - dataGen = new HoodieTestDataGenerator(); - } - - /** - * Cleanups test data generator. - * - */ - protected void cleanupTestDataGenerator() { - if (dataGen != null) { - dataGen = null; - } - } - /** * Initializes a distributed file system and base directory. * diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkWriteableTestTable.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkWriteableTestTable.java new file mode 100644 index 0000000000000..8e37c92d34a1d --- /dev/null +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/testutils/HoodieSparkWriteableTestTable.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.testutils; + +import org.apache.hudi.client.SparkTaskContextSupplier; +import org.apache.hudi.common.bloom.BloomFilter; +import org.apache.hudi.common.bloom.BloomFilterFactory; +import org.apache.hudi.common.bloom.BloomFilterTypeCode; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.table.HoodieTable; + +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Arrays; +import java.util.List; +import java.util.UUID; + +public class HoodieSparkWriteableTestTable extends HoodieWriteableTestTable { + private static final Logger LOG = LogManager.getLogger(HoodieSparkWriteableTestTable.class); + + private HoodieSparkWriteableTestTable(String basePath, FileSystem fs, HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { + super(basePath, fs, metaClient, schema, filter); + } + + public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema, BloomFilter filter) { + return new HoodieSparkWriteableTestTable(metaClient.getBasePath(), metaClient.getRawFs(), metaClient, schema, filter); + } + + public static HoodieSparkWriteableTestTable of(HoodieTableMetaClient metaClient, Schema schema) { + BloomFilter filter = BloomFilterFactory + .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name()); + return of(metaClient, schema, filter); + } + + public static HoodieSparkWriteableTestTable of(HoodieTable hoodieTable, Schema schema) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + return of(metaClient, schema); + } + + public static HoodieSparkWriteableTestTable of(HoodieTable hoodieTable, Schema schema, BloomFilter filter) { + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); + return of(metaClient, schema, filter); + } + + @Override + public HoodieSparkWriteableTestTable addCommit(String instantTime) throws Exception { + return (HoodieSparkWriteableTestTable) super.addCommit(instantTime); + } + + @Override + public HoodieSparkWriteableTestTable forCommit(String instantTime) { + return (HoodieSparkWriteableTestTable) super.forCommit(instantTime); + } + + public String getFileIdWithInserts(String partition) throws Exception { + return getFileIdWithInserts(partition, new HoodieRecord[0]); + } + + public String getFileIdWithInserts(String partition, HoodieRecord... records) throws Exception { + return getFileIdWithInserts(partition, Arrays.asList(records)); + } + + public String getFileIdWithInserts(String partition, List records) throws Exception { + String fileId = UUID.randomUUID().toString(); + withInserts(partition, fileId, records); + return fileId; + } + + public HoodieSparkWriteableTestTable withInserts(String partition, String fileId) throws Exception { + return withInserts(partition, fileId, new HoodieRecord[0]); + } + + public HoodieSparkWriteableTestTable withInserts(String partition, String fileId, HoodieRecord... 
records) throws Exception { + return withInserts(partition, fileId, Arrays.asList(records)); + } + + public HoodieSparkWriteableTestTable withInserts(String partition, String fileId, List records) throws Exception { + super.withInserts(partition, fileId, records, new SparkTaskContextSupplier()); + return this; + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java index 96a00da6f2b32..25b2c8baf9c7a 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java @@ -35,6 +35,7 @@ public class HoodieCommonTestHarness { protected String basePath = null; + protected transient HoodieTestDataGenerator dataGen = null; protected transient HoodieTableMetaClient metaClient; @TempDir public java.nio.file.Path tempDir; @@ -52,6 +53,24 @@ protected void initPath() { } } + /** + * Initializes a test data generator which used to generate test datas. + * + */ + protected void initTestDataGenerator() { + dataGen = new HoodieTestDataGenerator(); + } + + /** + * Cleanups test data generator. + * + */ + protected void cleanupTestDataGenerator() { + if (dataGen != null) { + dataGen = null; + } + } + /** * Initializes an instance of {@link HoodieTableMetaClient} with a special table type specified by * {@code getTableType()}. diff --git a/hudi-flink/pom.xml b/hudi-flink/pom.xml index 4b8cfd78e1009..4d9827485e6a7 100644 --- a/hudi-flink/pom.xml +++ b/hudi-flink/pom.xml @@ -173,5 +173,102 @@ bijection-avro_${scala.binary.version} 0.9.7 + + + + org.junit.jupiter + junit-jupiter-api + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + org.junit.vintage + junit-vintage-engine + test + + + org.junit.jupiter + junit-jupiter-params + test + + + org.mockito + mockito-junit-jupiter + test + + + org.junit.platform + junit-platform-runner + test + + + org.junit.platform + junit-platform-suite-api + test + + + org.junit.platform + junit-platform-commons + test + + + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-client-common + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-flink-client + ${project.version} + tests + test-jar + test + + + org.apache.hudi + hudi-hadoop-mr + ${project.version} + test + + + + + org.apache.flink + flink-test-utils_${scala.binary.version} + ${flink.version} + test + + + org.apache.flink + flink-runtime_${scala.binary.version} + ${flink.version} + test + tests + + + org.apache.flink + flink-streaming-java_${scala.binary.version} + ${flink.version} + test + tests + + diff --git a/hudi-flink/src/main/java/org/apache/hudi/HoodieFlinkStreamer.java b/hudi-flink/src/main/java/org/apache/hudi/HoodieFlinkStreamer.java index 0c9991da3db4a..cef4ddaf71ada 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/HoodieFlinkStreamer.java +++ b/hudi-flink/src/main/java/org/apache/hudi/HoodieFlinkStreamer.java @@ -19,9 +19,11 @@ package org.apache.hudi; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.operator.InstantGenerateOperator; import 
org.apache.hudi.operator.KeyedWriteProcessFunction; import org.apache.hudi.operator.KeyedWriteProcessOperator; @@ -42,11 +44,11 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; +import org.apache.kafka.clients.consumer.ConsumerConfig; import java.util.ArrayList; import java.util.List; import java.util.Objects; -import java.util.Properties; /** * An Utility which can incrementally consume data from Kafka and apply it to the target table. @@ -73,13 +75,21 @@ public static void main(String[] args) throws Exception { env.setStateBackend(new FsStateBackend(cfg.flinkCheckPointPath)); } - Properties kafkaProps = StreamerUtil.getKafkaProps(cfg); + TypedProperties props = StreamerUtil.getProps(cfg); + + // add kafka config + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, cfg.kafkaBootstrapServers); + props.put(ConsumerConfig.GROUP_ID_CONFIG, cfg.kafkaGroupId); + + // add data source config + props.put(HoodieWriteConfig.WRITE_PAYLOAD_CLASS, cfg.payloadClassName); + props.put(HoodieWriteConfig.PRECOMBINE_FIELD_PROP, cfg.sourceOrderingField); // Read from kafka source DataStream inputRecords = - env.addSource(new FlinkKafkaConsumer<>(cfg.kafkaTopic, new SimpleStringSchema(), kafkaProps)) + env.addSource(new FlinkKafkaConsumer<>(cfg.kafkaTopic, new SimpleStringSchema(), props)) .filter(Objects::nonNull) - .map(new JsonStringToHoodieRecordMapFunction(cfg)) + .map(new JsonStringToHoodieRecordMapFunction(props)) .name("kafka_to_hudi_record") .uid("kafka_to_hudi_record_uid"); diff --git a/hudi-flink/src/main/java/org/apache/hudi/source/JsonStringToHoodieRecordMapFunction.java b/hudi-flink/src/main/java/org/apache/hudi/source/JsonStringToHoodieRecordMapFunction.java index a01a67dad70dd..4eda371b4a017 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/source/JsonStringToHoodieRecordMapFunction.java +++ b/hudi-flink/src/main/java/org/apache/hudi/source/JsonStringToHoodieRecordMapFunction.java @@ -18,18 +18,21 @@ package org.apache.hudi.source; -import org.apache.hudi.HoodieFlinkStreamer; import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieFlinkStreamerException; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.keygen.KeyGenerator; import org.apache.hudi.keygen.SimpleAvroKeyGenerator; import org.apache.hudi.schema.FilebasedSchemaProvider; import org.apache.hudi.util.AvroConvertor; import org.apache.hudi.util.StreamerUtil; +import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.functions.MapFunction; @@ -40,32 +43,46 @@ */ public class JsonStringToHoodieRecordMapFunction implements MapFunction { - private final HoodieFlinkStreamer.Config cfg; + private TypedProperties props; private KeyGenerator keyGenerator; private AvroConvertor avroConvertor; + private Option schemaStr; + private String payloadClassName; + private String orderingField; - public JsonStringToHoodieRecordMapFunction(HoodieFlinkStreamer.Config cfg) { - this.cfg = cfg; + public JsonStringToHoodieRecordMapFunction(TypedProperties props) { + this(props, Option.empty()); + } + + public 
JsonStringToHoodieRecordMapFunction(TypedProperties props, Option schemaStr) { + this.props = props; + this.schemaStr = schemaStr; init(); } @Override public HoodieRecord map(String value) throws Exception { - GenericRecord gr = avroConvertor.fromJson(value); - HoodieRecordPayload payload = StreamerUtil.createPayload(cfg.payloadClassName, gr, - (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false)); + GenericRecord gr = this.avroConvertor.fromJson(value); + HoodieRecordPayload payload = StreamerUtil.createPayload(this.payloadClassName, gr, + (Comparable) HoodieAvroUtils.getNestedFieldVal(gr, this.orderingField, false)); - return new HoodieRecord<>(keyGenerator.getKey(gr), payload); + return new HoodieRecord<>(this.keyGenerator.getKey(gr), payload); } private void init() { - TypedProperties props = StreamerUtil.getProps(cfg); - avroConvertor = new AvroConvertor(new FilebasedSchemaProvider(props).getSourceSchema()); + if (schemaStr.isPresent()) { + this.avroConvertor = new AvroConvertor(new Schema.Parser().parse(schemaStr.get())); + } else { + this.avroConvertor = new AvroConvertor(new FilebasedSchemaProvider(props).getSourceSchema()); + } + this.payloadClassName = props.getString(HoodieWriteConfig.WRITE_PAYLOAD_CLASS, + OverwriteWithLatestAvroPayload.class.getName()); + this.orderingField = props.getString(HoodieWriteConfig.PRECOMBINE_FIELD_PROP, "ts"); try { - keyGenerator = StreamerUtil.createKeyGenerator(props); + this.keyGenerator = StreamerUtil.createKeyGenerator(props); } catch (IOException e) { throw new HoodieFlinkStreamerException(String.format("KeyGenerator %s initialization failed", - props.getString("hoodie.datasource.write.keygenerator.class", SimpleAvroKeyGenerator.class.getName())), e); + props.getString(HoodieWriteConfig.KEYGENERATOR_CLASS_PROP, SimpleAvroKeyGenerator.class.getName())), e); } } } diff --git a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java b/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java index 2905a88afcf38..520527b581b04 100644 --- a/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java +++ b/hudi-flink/src/main/java/org/apache/hudi/util/StreamerUtil.java @@ -38,7 +38,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.kafka.clients.consumer.ConsumerConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,19 +45,11 @@ import java.io.IOException; import java.io.StringReader; import java.util.List; -import java.util.Properties; public class StreamerUtil { private static Logger LOG = LoggerFactory.getLogger(StreamerUtil.class); - public static Properties getKafkaProps(HoodieFlinkStreamer.Config cfg) { - Properties result = new Properties(); - result.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, cfg.kafkaBootstrapServers); - result.put(ConsumerConfig.GROUP_ID_CONFIG, cfg.kafkaGroupId); - return result; - } - public static TypedProperties getProps(HoodieFlinkStreamer.Config cfg) { return readConfig( FSUtils.getFs(cfg.propsFilePath, getHadoopConf()), diff --git a/hudi-flink/src/test/java/org/apache/hudi/source/TestJsonStringToHoodieRecordMapFunction.java b/hudi-flink/src/test/java/org/apache/hudi/source/TestJsonStringToHoodieRecordMapFunction.java new file mode 100644 index 0000000000000..af5b75502b587 --- /dev/null +++ b/hudi-flink/src/test/java/org/apache/hudi/source/TestJsonStringToHoodieRecordMapFunction.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) 
under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.source; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.testutils.RawTripTestPayload; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; +import org.apache.hudi.testutils.HoodieFlinkClientTestHarness; + +import org.apache.avro.Schema; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.AVRO_SCHEMA; + +public class TestJsonStringToHoodieRecordMapFunction extends HoodieFlinkClientTestHarness { + @BeforeEach + public void init() { + initPath(); + initTestDataGenerator(); + initFileSystem(); + initFlinkMiniCluster(); + } + + @AfterEach + public void clean() throws Exception { + cleanupTestDataGenerator(); + cleanupFileSystem(); + cleanupFlinkMiniCluster(); + } + + @Test + public void testMapFunction() throws Exception { + final String newCommitTime = "001"; + final int numRecords = 10; + List records = dataGen.generateInserts(newCommitTime, numRecords); + List recordStr = RawTripTestPayload.recordsToStrings(records); + Schema schema = AVRO_SCHEMA; + + TypedProperties props = new TypedProperties(); + props.put(HoodieWriteConfig.WRITE_PAYLOAD_CLASS, OverwriteWithLatestAvroPayload.class.getName()); + props.put(HoodieWriteConfig.PRECOMBINE_FIELD_PROP, "timestamp"); + props.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key"); + props.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "partitionPath"); + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(2); + + SimpleTestSinkFunction.valuesList.clear(); + env.fromCollection(recordStr) + .map(new JsonStringToHoodieRecordMapFunction(props, Option.of(schema.toString()))) + .addSink(new SimpleTestSinkFunction()); + env.execute(); + + // input records all present in the sink + Assertions.assertEquals(10, SimpleTestSinkFunction.valuesList.size()); + + // input keys all present in the sink + Set inputKeySet = records.stream().map(r -> r.getKey().getRecordKey()).collect(Collectors.toSet()); + Assertions.assertEquals(10, SimpleTestSinkFunction.valuesList.stream() + .map(r -> inputKeySet.contains(r.getRecordKey())).filter(b -> b).count()); + } +} diff --git a/hudi-flink/src/test/resources/log4j-surefire-quiet.properties 
b/hudi-flink/src/test/resources/log4j-surefire-quiet.properties new file mode 100644 index 0000000000000..2b94ea2903067 --- /dev/null +++ b/hudi-flink/src/test/resources/log4j-surefire-quiet.properties @@ -0,0 +1,30 @@ +### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### +log4j.rootLogger=WARN, CONSOLE +log4j.logger.org.apache.hudi=DEBUG +log4j.logger.org.apache.hadoop.hbase=ERROR + +# CONSOLE is set to be a ConsoleAppender. +log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender +# CONSOLE uses PatternLayout. +log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout +log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n +log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter +log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true +log4j.appender.CONSOLE.filter.a.LevelMin=WARN +log4j.appender.CONSOLE.filter.a.LevelMax=FATAL \ No newline at end of file diff --git a/hudi-flink/src/test/resources/log4j-surefire.properties b/hudi-flink/src/test/resources/log4j-surefire.properties new file mode 100644 index 0000000000000..32af462093ae5 --- /dev/null +++ b/hudi-flink/src/test/resources/log4j-surefire.properties @@ -0,0 +1,31 @@ +### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### +log4j.rootLogger=WARN, CONSOLE +log4j.logger.org.apache=INFO +log4j.logger.org.apache.hudi=DEBUG +log4j.logger.org.apache.hadoop.hbase=ERROR + +# A1 is set to be a ConsoleAppender. +log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender +# A1 uses PatternLayout. 
+log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout +log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n +log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter +log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true +log4j.appender.CONSOLE.filter.a.LevelMin=WARN +log4j.appender.CONSOLE.filter.a.LevelMax=FATAL diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala index 0a335839a8b84..f0974977e4a4b 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala @@ -20,6 +20,7 @@ package org.apache.hudi import org.apache.hudi.common.model.HoodieTableType import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload import org.apache.hudi.common.model.WriteOperationType +import org.apache.hudi.config.HoodieWriteConfig import org.apache.hudi.hive.HiveSyncTool import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor import org.apache.hudi.keygen.SimpleKeyGenerator @@ -202,14 +203,14 @@ object DataSourceWriteOptions { * key value, we will pick the one with the largest value for the precombine field, * determined by Object.compareTo(..) */ - val PRECOMBINE_FIELD_OPT_KEY = "hoodie.datasource.write.precombine.field" + val PRECOMBINE_FIELD_OPT_KEY = HoodieWriteConfig.PRECOMBINE_FIELD_PROP val DEFAULT_PRECOMBINE_FIELD_OPT_VAL = "ts" /** * Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. * This will render any value set for `PRECOMBINE_FIELD_OPT_VAL` in-effective */ - val PAYLOAD_CLASS_OPT_KEY = "hoodie.datasource.write.payload.class" + val PAYLOAD_CLASS_OPT_KEY = HoodieWriteConfig.WRITE_PAYLOAD_CLASS val DEFAULT_PAYLOAD_OPT_VAL = classOf[OverwriteWithLatestAvroPayload].getName /** @@ -241,7 +242,7 @@ object DataSourceWriteOptions { * Key generator class, that implements will extract the key out of incoming record * */ - val KEYGENERATOR_CLASS_OPT_KEY = "hoodie.datasource.write.keygenerator.class" + val KEYGENERATOR_CLASS_OPT_KEY = HoodieWriteConfig.KEYGENERATOR_CLASS_PROP val DEFAULT_KEYGENERATOR_CLASS_OPT_VAL = classOf[SimpleKeyGenerator].getName /**
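(Illustration only, to close the loop on the Flink side of this change.) Both HoodieFlinkStreamer and the new TestJsonStringToHoodieRecordMapFunction drive the map function purely from TypedProperties keyed by the HoodieWriteConfig constants introduced above; a condensed sketch, with the schema string and field names as placeholders:

```java
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.source.JsonStringToHoodieRecordMapFunction;

public class FlinkMapFunctionSketch { // hypothetical, only for illustration
  public static JsonStringToHoodieRecordMapFunction create(String schemaString) {
    TypedProperties props = new TypedProperties();
    props.put(HoodieWriteConfig.WRITE_PAYLOAD_CLASS, OverwriteWithLatestAvroPayload.class.getName());
    props.put(HoodieWriteConfig.PRECOMBINE_FIELD_PROP, "ts");                    // ordering field
    props.put(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY, "_row_key");          // record key field
    props.put(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY, "partitionPath"); // partition path field
    // Passing an explicit schema string skips the file-based schema provider.
    return new JsonStringToHoodieRecordMapFunction(props, Option.of(schemaString));
  }
}
```

On the Spark side, DataSourceWriteOptions now aliases the same keys, so the Spark datasource, the Flink streamer, and HoodieWriteConfig all agree on the property names.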