[HUDI-1349] spark sql support overwrite use replace action #2196
@@ -0,0 +1,74 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.table.action.commit;

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieEngineContext;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCommitException;
import org.apache.hudi.table.HoodieTable;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class SparkInsertOverwriteTableCommitActionExecutor<T extends HoodieRecordPayload<T>>
    extends SparkInsertOverwriteCommitActionExecutor<T> {

  public SparkInsertOverwriteTableCommitActionExecutor(HoodieEngineContext context,
                                                       HoodieWriteConfig config, HoodieTable table,
                                                       String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
    super(context, config, table, instantTime, inputRecordsRDD, WriteOperationType.INSERT_OVERWRITE_TABLE);
  }

  protected List<String> getAllExistingFileIds(String partitionPath) {
    return table.getSliceView().getLatestFileSlices(partitionPath)
        .map(fg -> fg.getFileId()).distinct().collect(Collectors.toList());
  }

  @Override
  protected Map<String, List<String>> getPartitionToReplacedFileIds(JavaRDD<WriteStatus> writeStatuses) {
    Map<String, List<String>> partitionToExistingFileIds = new HashMap<>();
    try {
      List<String> partitionPaths = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(),
          table.getMetaClient().getBasePath(), false);
      JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
      if (partitionPaths != null && partitionPaths.size() > 0) {
        context.setJobStatus(this.getClass().getSimpleName(), "Getting ExistingFileIds of all partitions");
        JavaRDD<String> partitionPathRdd = jsc.parallelize(partitionPaths, partitionPaths.size());
        partitionToExistingFileIds = partitionPathRdd.mapToPair(
            partitionPath -> new Tuple2<>(partitionPath, getAllExistingFileIds(partitionPath))).collectAsMap();
      }
    } catch (IOException e) {
      throw new HoodieCommitException("InsertOverwriteTable action failed to get existing fileIds of all partitions of "
          + config.getBasePath() + " at time " + instantTime, e);
    }
    return partitionToExistingFileIds;
  }
}
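For orientation (not part of the diff): based on the tests added below, this executor is reached when a DataFrame is written with SaveMode.Overwrite, which now records a replacecommit over all existing file groups instead of deleting and recreating the base path. A minimal sketch, assuming the hudi-spark-bundle is on the classpath; the table name, path, and column names are placeholders, not taken from this PR.

// Hedged sketch only; names and paths are illustrative.
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder()
  .appName("hudi-overwrite-sketch")
  .master("local[2]")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .getOrCreate()
import spark.implicits._

val basePath = "/tmp/hudi_overwrite_sketch"
val df = Seq(("id1", "2020/03/11", 1L)).toDF("uuid", "partitionpath", "ts")

df.write.format("org.apache.hudi")
  .option("hoodie.table.name", "overwrite_sketch")
  .option("hoodie.datasource.write.recordkey.field", "uuid")
  .option("hoodie.datasource.write.partitionpath.field", "partitionpath")
  .option("hoodie.datasource.write.precombine.field", "ts")
  .option("hoodie.insert.shuffle.parallelism", "2")
  .option("hoodie.upsert.shuffle.parallelism", "2")
  // With this change, SaveMode.Overwrite is expected to route through
  // SparkInsertOverwriteTableCommitActionExecutor and add a "replacecommit"
  // to the timeline rather than wiping basePath.
  .mode(SaveMode.Overwrite)
  .save(basePath)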
@@ -18,7 +18,13 @@
package org.apache.hudi.functional

import java.sql.{Date, Timestamp}
import java.util.function.Supplier
import java.util.stream.Stream

import org.apache.hadoop.fs.Path
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.table.timeline.HoodieInstant
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.testutils.HoodieClientTestBase

@@ -156,6 +162,79 @@ class TestCOWDataSource extends HoodieClientTestBase {
    assertEquals(100, timeTravelDF.count()) // 100 initial inserts must be pulled
  }

  @Test def testOverWriteModeUseReplaceAction(): Unit = {
    val records1 = recordsToStrings(dataGen.generateInserts("001", 5)).toList
    val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
    inputDF1.write.format("org.apache.hudi")
      .options(commonOpts)
      .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
      .mode(SaveMode.Append)
      .save(basePath)

    val records2 = recordsToStrings(dataGen.generateInserts("002", 5)).toList
    val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
    inputDF2.write.format("org.apache.hudi")
      .options(commonOpts)
      .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
      .mode(SaveMode.Overwrite)
      .save(basePath)

    val metaClient = new HoodieTableMetaClient(spark.sparkContext.hadoopConfiguration, basePath, true)
    val commits = metaClient.getActiveTimeline.filterCompletedInstants().getInstants.toArray
      .map(instant => instant.asInstanceOf[HoodieInstant].getAction)
    assertEquals(2, commits.size)
    assertEquals("commit", commits(0))
    assertEquals("replacecommit", commits(1))
  }

  @Test def testOverWriteModeUseReplaceActionOnDisJointPartitions(): Unit = {
    // step1: Write 5 records to the hoodie table for partition1 DEFAULT_FIRST_PARTITION_PATH
    val records1 = recordsToStrings(dataGen.generateInsertsForPartition("001", 5, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList
    val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
    inputDF1.write.format("org.apache.hudi")
      .options(commonOpts)
      .option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
      .mode(SaveMode.Append)
      .save(basePath)

    // step2: Write 7 more records using SaveMode.Overwrite for partition2 DEFAULT_SECOND_PARTITION_PATH
    val records2 = recordsToStrings(dataGen.generateInsertsForPartition("002", 7, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).toList
    val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
    inputDF2.write.format("org.apache.hudi")
      .options(commonOpts)
      .mode(SaveMode.Overwrite)
      .save(basePath)

    val allRecords = spark.read.format("org.apache.hudi").load(basePath + "/*/*/*")
    allRecords.registerTempTable("tmpTable")

    spark.sql("select count(*) from tmpTable").show()

    // step3: Query the row count for partition1 DEFAULT_FIRST_PARTITION_PATH
    val recordCountForPartition1 = spark.sql(String.format("select count(*) from tmpTable where partition = '%s'", HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).collect()
    assertEquals("0", recordCountForPartition1(0).get(0).toString)

    // step4: Query the row count for partition2 DEFAULT_SECOND_PARTITION_PATH
    val recordCountForPartition2 = spark.sql(String.format("select count(*) from tmpTable where partition = '%s'", HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).collect()
    assertEquals("7", recordCountForPartition2(0).get(0).toString)

    // step5: Query the total row count of the hoodie table
    val recordCount = spark.sql("select count(*) from tmpTable").collect()
    assertEquals("7", recordCount(0).get(0).toString)

    // step6: Query the row count for partition2 DEFAULT_SECOND_PARTITION_PATH using spark.collect and then a filter
    val recordsForPartitionColumn = spark.sql("select partition from tmpTable").collect()
    val filterSecondPartitionCount = recordsForPartitionColumn.filter(row => row.get(0).equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).size
    assertEquals(7, filterSecondPartitionCount)

    val metaClient = new HoodieTableMetaClient(spark.sparkContext.hadoopConfiguration, basePath, true)
    val commits = metaClient.getActiveTimeline.filterCompletedInstants().getInstants.toArray
      .map(instant => instant.asInstanceOf[HoodieInstant].getAction)
    assertEquals(2, commits.size)
    assertEquals("commit", commits(0))
    assertEquals("replacecommit", commits(1))
  }

  @Test def testDropInsertDup(): Unit = {
    val insert1Cnt = 10
    val insert2DupKeyCnt = 9
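Side note (not in the diff): the timeline assertion in the tests above can also be checked by hand by listing the instant files under the table's `.hoodie` directory. A hedged sketch, reusing the same `basePath` and Hadoop configuration as the tests; after an insert followed by a SaveMode.Overwrite write it should list one *.commit and one *.replacecommit:

// Sketch only: list completed instant files under basePath/.hoodie and keep
// the commit / replacecommit actions.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val hoodieDir = new Path(basePath, ".hoodie")
val fs: FileSystem = hoodieDir.getFileSystem(new Configuration())
fs.listStatus(hoodieDir)
  .map(_.getPath.getName)
  .filter(name => name.endsWith(".commit") || name.endsWith(".replacecommit"))
  .sorted
  .foreach(println)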
I ran a similar test using the quick start setup guide:
scala> val inserts = convertToStringList(dataGen.generateInserts(10))
inserts: java.util.List[String] = [{"ts": 0, "uuid": "299d5202-1ea0-4918-9d2f-2365bc1c2402", "rider": "rider-213", "driver": "driver-213", "begin_lat": 0.4726905879569653, "begin_lon": 0.46157858450465483, "end_lat": 0.754803407008858, "end_lon": 0.9671159942018241, "fare": 34.158284716382845, "partitionpath": "2020/03/11"}, {"ts": 0, "uuid": "0fc23a14-c815-4b09-bff1-c6193a6de5b7", "rider": "rider-213", "driver": "driver-213", "begin_lat": 0.6100070562136587, "begin_lon": 0.8779402295427752, "end_lat": 0.3407870505929602, "end_lon": 0.5030798142293655, "fare": 43.4923811219014, "partitionpath": "2020/03/11"}, {"ts": 0, "uuid": "7136e8f8-ed82-4fc4-b60d-f7367f7be791", "rider": "rider-213", "driver": "driver-213", "begin_lat": 0.5731835407930634, "begin_lon": 0.4923479652912024, "end_lat":...
scala> val df = spark.read.json(spark.sparkContext.parallelize(inserts, 1))
warning: there was one deprecation warning; re-run with -deprecation for details
df: org.apache.spark.sql.DataFrame = [begin_lat: double, begin_lon: double ... 8 more fields]
scala> df.write.format("org.apache.hudi").
| options(getQuickstartWriteConfigs).
| option(PRECOMBINE_FIELD_OPT_KEY, "ts").
| option(RECORDKEY_FIELD_OPT_KEY, "uuid").
| option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
| option(TABLE_NAME, tableName).
| mode(Overwrite).
| save(basePath);

scala> val tripsSnapshotDF = spark.
| read.
| format("org.apache.hudi").
| load(basePath + "/*/*/*/*")
20/11/04 14:51:53 WARN DefaultSource: Loading Base File Only View.
tripsSnapshotDF: org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 13 more fields]
scala> tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")
scala>
scala> spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_trips_snapshot").show()
+-------------------+--------------------+----------------------+---------+----------+------------------+
|_hoodie_commit_time| _hoodie_record_key|_hoodie_partition_path| rider| driver| fare|
+-------------------+--------------------+----------------------+---------+----------+------------------+
| 20201104145141|299d5202-1ea0-491...| 2020/03/11|rider-213|driver-213|34.158284716382845|
| 20201104145141|0fc23a14-c815-4b0...| 2020/03/11|rider-213|driver-213| 43.4923811219014|
| 20201104145141|7136e8f8-ed82-4fc...| 2020/03/11|rider-213|driver-213| 64.27696295884016|
| 20201104145141|5ffa488e-d75e-4ef...| 2020/03/11|rider-213|driver-213| 93.56018115236618|
| 20201104145141|cf09166f-dc3f-45e...| 2020/03/11|rider-213|driver-213|17.851135255091155|
| 20201104145141|6f522490-e29e-419...| 2020/03/11|rider-213|driver-213|19.179139106643607|
| 20201104145141|db97e3ef-ad7a-4e8...| 2020/03/11|rider-213|driver-213| 33.92216483948643|
| 20201104145141|a42d7c22-d0bf-4b9...| 2020/03/11|rider-213|driver-213| 66.62084366450246|
| 20201104145141|94154d3e-c3da-436...| 2020/03/11|rider-213|driver-213| 41.06290929046368|
| 20201104145141|618b3f38-bb71-402...| 2020/03/11|rider-213|driver-213| 27.79478688582596|
+-------------------+--------------------+----------------------+---------+----------+------------------+
scala> val dataGen = new DataGenerator(Array("2020/09/11"))
dataGen: org.apache.hudi.QuickstartUtils.DataGenerator = org.apache.hudi.QuickstartUtils$DataGenerator@f00b18a
scala> val inserts2 = convertToStringList(dataGen.generateInserts(1))
scala> val df = spark.read.json(spark.sparkContext.parallelize(inserts2, 1))
scala> df.write.format("org.apache.hudi").
| options(getQuickstartWriteConfigs).
| option(PRECOMBINE_FIELD_OPT_KEY, "ts").
| option(RECORDKEY_FIELD_OPT_KEY, "uuid").
| option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
| option(TABLE_NAME, tableName).
| mode(Append).
| option(OPERATION_OPT_KEY, "insert_overwrite").
| save(basePath);
4. Query the data back
scala> spark.sql("select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_trips_snapshot").show()
+-------------------+--------------------+----------------------+---------+----------+------------------+
|_hoodie_commit_time| _hoodie_record_key|_hoodie_partition_path| rider| driver| fare|
+-------------------+--------------------+----------------------+---------+----------+------------------+
| 20201104145258|299d5202-1ea0-491...| 2020/03/11|rider-213|driver-213|34.158284716382845|
| 20201104145258|0fc23a14-c815-4b0...| 2020/03/11|rider-213|driver-213| 43.4923811219014|
| 20201104145258|7136e8f8-ed82-4fc...| 2020/03/11|rider-213|driver-213| 64.27696295884016|
| 20201104145258|5ffa488e-d75e-4ef...| 2020/03/11|rider-213|driver-213| 93.56018115236618|
| 20201104145258|cf09166f-dc3f-45e...| 2020/03/11|rider-213|driver-213|17.851135255091155|
| 20201104145258|6f522490-e29e-419...| 2020/03/11|rider-213|driver-213|19.179139106643607|
| 20201104145258|db97e3ef-ad7a-4e8...| 2020/03/11|rider-213|driver-213| 33.92216483948643|
| 20201104145258|a42d7c22-d0bf-4b9...| 2020/03/11|rider-213|driver-213| 66.62084366450246|
| 20201104145258|94154d3e-c3da-436...| 2020/03/11|rider-213|driver-213| 41.06290929046368|
| 20201104145258|618b3f38-bb71-402...| 2020/03/11|rider-213|driver-213| 27.79478688582596|
| 20201104145348|4dac8aa3-b8fa-410...| 2020/09/11|rider-284|driver-284|49.527694252432056|
+-------------------+--------------------+----------------------+---------+----------+------------------+
As you can see in step4, we see all 11 records. With 'SaveMode.Overwrite' we should only see 1 record. Hope this is clear.
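For readers following along, the behaviour seen above matches partition-level semantics: insert_overwrite replaces file groups only in the partitions present in the incoming batch, so the ten older records under 2020/03/11 survive and only 2020/09/11 is rewritten. The whole-table behaviour this PR wires SaveMode.Overwrite to corresponds to insert_overwrite_table. A hedged sketch of the contrast, where `df`, `basePath`, and `hudiOpts` (record key, partition path, precombine field, table name) are placeholders:

import org.apache.spark.sql.SaveMode

// Partition-level overwrite: only the partitions present in `df` are replaced,
// which is what the snippet above exercises.
df.write.format("org.apache.hudi")
  .options(hudiOpts)
  .option("hoodie.datasource.write.operation", "insert_overwrite")
  .mode(SaveMode.Append)
  .save(basePath)

// Table-level overwrite: per the tests in this PR, plain SaveMode.Overwrite is
// expected to replace every existing file group via a single replacecommit.
df.write.format("org.apache.hudi")
  .options(hudiOpts)
  .mode(SaveMode.Overwrite)
  .save(basePath)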