From 6ddb4fc48f08408040645c5221faf683b97ad3cb Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Fri, 11 Sep 2020 17:19:38 +0800 Subject: [PATCH 01/24] chore: add a blank line before the deduplicateRecords javadoc in WriteHelper --- .../java/org/apache/hudi/table/action/commit/WriteHelper.java | 1 + 1 file changed, 1 insertion(+) diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java index 92dcbb628ad15..5bba0cf934d10 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java @@ -74,6 +74,7 @@ public static > JavaRDD> combin return condition ? deduplicateRecords(records, table, parallelism) : records; } + /** * Deduplicate Hoodie records, using the given deduplication function. * From 21f0d89d0b9d222d3095bb666cce6dae89e994c0 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Fri, 11 Sep 2020 17:26:52 +0800 Subject: [PATCH 02/24] chore: revert the whitespace-only change from the previous commit --- .../java/org/apache/hudi/table/action/commit/WriteHelper.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java index 5bba0cf934d10..92dcbb628ad15 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java @@ -74,7 +74,6 @@ public static > JavaRDD> combin return condition ? deduplicateRecords(records, table, parallelism) : records; } - /** * Deduplicate Hoodie records, using the given deduplication function.
* From 835fce867dd4c165165d5a96703720b9957062d0 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Sun, 20 Sep 2020 17:52:20 +0800 Subject: [PATCH 03/24] feat: update preCombine --- .../apache/hudi/client/HoodieWriteClient.java | 16 +++++++-- .../bootstrap/BootstrapRecordPayload.java | 7 ++++ .../apache/hudi/config/HoodieWriteConfig.java | 8 +++++ .../hudi/table/HoodieCopyOnWriteTable.java | 5 +-- .../hudi/table/HoodieMergeOnReadTable.java | 5 +-- .../org/apache/hudi/table/HoodieTable.java | 2 +- .../table/action/commit/BulkInsertHelper.java | 2 +- .../commit/UpsertCommitActionExecutor.java | 17 ++++++++-- .../hudi/table/action/commit/WriteHelper.java | 33 ++++++++++++++----- .../UpsertDeltaCommitActionExecutor.java | 17 ++++++++-- .../TestHoodieClientOnCopyOnWriteStorage.java | 5 +-- .../apache/hudi/common/HoodieJsonPayload.java | 5 +++ .../hudi/common/model/BaseAvroPayload.java | 2 +- .../model/EmptyHoodieRecordPayload.java | 7 ++++ .../hudi/common/model/HoodieAvroPayload.java | 5 +++ .../common/model/HoodieRecordPayload.java | 16 +++++++++ .../model/OverwriteWithLatestAvroPayload.java | 33 +++++++++++++++++++ .../testutils/AvroBinaryTestPayload.java | 5 +++ .../common/testutils/RawTripTestPayload.java | 5 +++ .../java/org/apache/hudi/DataSourceUtils.java | 4 +-- .../apache/hudi/HoodieSparkSqlWriter.scala | 3 +- .../org/apache/hudi/TestDataSourceUtils.java | 6 ++-- 22 files changed, 177 insertions(+), 31 deletions(-) diff --git a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java index 142ff330a87a9..3e741701c4192 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java @@ -190,7 +190,19 @@ public JavaRDD upsert(JavaRDD> records, final Strin table.validateUpsertSchema(); setOperationType(WriteOperationType.UPSERT); this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata result = table.upsert(jsc, instantTime, records); + HoodieWriteMetadata result = table.upsert(jsc, instantTime, records, null); + if (result.getIndexLookupDuration().isPresent()) { + metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); + } + return postWrite(result, instantTime, table); + } + + public JavaRDD upsert(JavaRDD> records, final String instantTime, String schema) { + HoodieTable table = getTableAndInitCtx(WriteOperationType.UPSERT, instantTime); + table.validateUpsertSchema(); + setOperationType(WriteOperationType.UPSERT); + this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); + HoodieWriteMetadata result = table.upsert(jsc, instantTime, records, schema); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } @@ -198,7 +210,7 @@ public JavaRDD upsert(JavaRDD> records, final Strin } /** - * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. *

* This implementation requires that the input records are already tagged, and de-duped if needed. * diff --git a/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java b/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java index fa508e42f120c..7d66b61ae36e6 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java @@ -25,6 +25,8 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import java.io.IOException; + public class BootstrapRecordPayload implements HoodieRecordPayload { private final GenericRecord record; @@ -38,6 +40,11 @@ public BootstrapRecordPayload preCombine(BootstrapRecordPayload another) { return this; } + @Override + public BootstrapRecordPayload preCombine(BootstrapRecordPayload another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) { return Option.ofNullable(record); diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 089474d15c2e8..5b00d9d049834 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -82,6 +82,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { public static final String DEFAULT_COMBINE_BEFORE_UPSERT = "true"; public static final String COMBINE_BEFORE_DELETE_PROP = "hoodie.combine.before.delete"; public static final String DEFAULT_COMBINE_BEFORE_DELETE = "true"; + public static final String COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP = "hoodie.combine.all.fields.before.upsert"; + public static final String DEFAULT_COMBINE_ALL_FIELDS_BEFORE_UPSERT = "false"; public static final String WRITE_STATUS_STORAGE_LEVEL = "hoodie.write.status.storage.level"; public static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; public static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; @@ -236,6 +238,10 @@ public boolean shouldCombineBeforeDelete() { return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_DELETE_PROP)); } + public boolean shouldCombineAllFieldsBeforeUpsert() { + return Boolean.parseBoolean(props.getProperty(COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP)); + } + public boolean shouldAllowMultiWriteOnSameInstant() { return Boolean.parseBoolean(props.getProperty(ALLOW_MULTI_WRITE_ON_SAME_INSTANT)); } @@ -987,6 +993,8 @@ protected void setDefaults() { DEFAULT_COMBINE_BEFORE_UPSERT); setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_DELETE_PROP), COMBINE_BEFORE_DELETE_PROP, DEFAULT_COMBINE_BEFORE_DELETE); + setDefaultOnCondition(props, !props.containsKey(COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP), + COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_ALL_FIELDS_BEFORE_UPSERT); setDefaultOnCondition(props, !props.containsKey(ALLOW_MULTI_WRITE_ON_SAME_INSTANT), ALLOW_MULTI_WRITE_ON_SAME_INSTANT, DEFAULT_ALLOW_MULTI_WRITE_ON_SAME_INSTANT); setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL, diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java index 31ba537fa2ce6..20d1fe32fe0f8 100644 
--- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java @@ -83,8 +83,9 @@ public HoodieCopyOnWriteTable(HoodieWriteConfig config, Configuration hadoopConf } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { - return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, + String schema) { + return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java index a236cdb9411de..a792af8c06bb5 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java @@ -76,8 +76,9 @@ public class HoodieMergeOnReadTable extends Hoodi } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { - return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, + String schema) { + return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java index 71bcb31314a81..0973dc47dd9d6 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -142,7 +142,7 @@ public static HoodieTable create(HoodieTableM * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, - JavaRDD> records); + JavaRDD> records, String schema); /** * Insert a batch of new records into Hoodie table at the supplied instantTime. 
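For context, a minimal usage sketch (not part of the diff) of the new schema-aware upsert path; jsc, recordsRdd, and schemaStr are assumed to be set up elsewhere, and cfg is a HoodieWriteConfig with hoodie.combine.all.fields.before.upsert=true:

// Hypothetical driver for the new upsert overload added in this patch: the third
// argument carries the Avro writer schema so WriteHelper can parse it and route
// deduplication through the schema-aware preCombine.
HoodieWriteClient<OverwriteWithLatestAvroPayload> client = new HoodieWriteClient<>(jsc, cfg);
String instantTime = client.startCommit();
JavaRDD<WriteStatus> statuses = client.upsert(recordsRdd, instantTime, schemaStr);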
diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java index 4683c8218d782..53bd99464a698 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java @@ -50,7 +50,7 @@ public static > HoodieWriteMetadata bulkInsert( if (performDedupe) { dedupedRecords = WriteHelper.combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, - config.getBulkInsertShuffleParallelism(), ((HoodieTable)table)); + config.getBulkInsertShuffleParallelism(), ((HoodieTable)table), false, null); } final JavaRDD> repartitionedRecords; diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java index 0c4d08e35ac21..7a6347e0c0a02 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java @@ -32,10 +32,20 @@ public class UpsertCommitActionExecutor> extends CommitActionExecutor { private JavaRDD> inputRecordsRDD; + private String schema; public UpsertCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD, + String schema) { + super(jsc, config, table, instantTime, WriteOperationType.UPSERT); + this.inputRecordsRDD = inputRecordsRDD; + this.schema = schema; + } + + public UpsertCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @@ -43,6 +53,7 @@ public UpsertCommitActionExecutor(JavaSparkContext jsc, @Override public HoodieWriteMetadata execute() { return WriteHelper.write(instantTime, inputRecordsRDD, jsc, (HoodieTable)table, - config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, true); + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), + config.shouldCombineAllFieldsBeforeUpsert(), schema, this, true); } } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java index 92dcbb628ad15..15545ec7162f2 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.action.commit; +import org.apache.avro.Schema; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -38,11 +39,23 @@ public class WriteHelper> { public static > HoodieWriteMetadata write(String instantTime, JavaRDD> inputRecordsRDD, JavaSparkContext jsc, HoodieTable table, boolean shouldCombine, - int shuffleParallelism, CommitActionExecutor executor, boolean performTagging) { + int shuffleParallelism, CommitActionExecutor executor, + boolean performTagging) { + return write(instantTime, inputRecordsRDD, jsc, table, 
shouldCombine, shuffleParallelism, false, + null, executor, performTagging); + } + + public static > HoodieWriteMetadata write(String instantTime, + JavaRDD> inputRecordsRDD, JavaSparkContext jsc, + HoodieTable table, boolean shouldCombine, + int shuffleParallelism, boolean precombineAgg, + String schema, CommitActionExecutor executor, + boolean performTagging) { try { // De-dupe/merge if needed JavaRDD> dedupedRecords = - combineOnCondition(shouldCombine, inputRecordsRDD, shuffleParallelism, table); + combineOnCondition(shouldCombine, inputRecordsRDD, shuffleParallelism, table, precombineAgg, schema); + Instant lookupBegin = Instant.now(); JavaRDD> taggedRecords = dedupedRecords; @@ -70,8 +83,9 @@ private static > JavaRDD> tag( } public static > JavaRDD> combineOnCondition( - boolean condition, JavaRDD> records, int parallelism, HoodieTable table) { - return condition ? deduplicateRecords(records, table, parallelism) : records; + boolean condition, JavaRDD> records, int parallelism, HoodieTable table, + boolean precombineAgg, String schema) { + return condition ? deduplicateRecords(records, table, parallelism, precombineAgg, schema) : records; } /** @@ -82,12 +96,14 @@ public static > JavaRDD> combin * @return RDD of HoodieRecord already be deduplicated */ public static > JavaRDD> deduplicateRecords( - JavaRDD> records, HoodieTable table, int parallelism) { - return deduplicateRecords(records, table.getIndex(), parallelism); + JavaRDD> records, HoodieTable table, int parallelism, boolean precombineAgg, + String schema) { + return deduplicateRecords(records, table.getIndex(), parallelism, precombineAgg, schema); } public static > JavaRDD> deduplicateRecords( - JavaRDD> records, HoodieIndex index, int parallelism) { + JavaRDD> records, HoodieIndex index, int parallelism, boolean precombineAgg, + String schema) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); @@ -96,7 +112,8 @@ public static > JavaRDD> dedupl return new Tuple2<>(key, record); }).reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") - T reducedData = (T) rec1.getData().preCombine(rec2.getData()); + T reducedData = precombineAgg && schema != null ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema)) + : (T) rec1.getData().preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records.
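To make the merge semantics concrete, here is a small sketch (schema and values are illustrative, not from the patch; the IOException declared by preCombine is left unhandled) of what the schema-aware preCombine added later in this patch to OverwriteWithLatestAvroPayload is expected to produce for two payloads sharing a key:

// Illustrative nullable schema so the field-level merge has nulls to back-fill.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"Rec\",\"fields\":["
        + "{\"name\":\"id\",\"type\":\"string\"},"
        + "{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null},"
        + "{\"name\":\"age\",\"type\":[\"null\",\"int\"],\"default\":null},"
        + "{\"name\":\"ts\",\"type\":\"long\"}]}");
GenericRecord a = new GenericData.Record(schema);
a.put("id", "1"); a.put("name", "Karl"); a.put("age", null); a.put("ts", 0L);
GenericRecord b = new GenericData.Record(schema);
b.put("id", "1"); b.put("name", null); b.put("age", 18); b.put("ts", 0L);
// The payload with the higher ordering value wins; its null fields are then
// back-filled from the losing record by combineAllFields.
OverwriteWithLatestAvroPayload p1 = new OverwriteWithLatestAvroPayload(a, 1);
OverwriteWithLatestAvroPayload p2 = new OverwriteWithLatestAvroPayload(b, 2);
OverwriteWithLatestAvroPayload merged = p1.preCombine(p2, schema);
// merged.recordBytes now decodes to {id: 1, name: "Karl", age: 18, ts: 0}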
diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java index 1809078cb85e3..88c95155aa9ab 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java @@ -33,10 +33,20 @@ public class UpsertDeltaCommitActionExecutor> extends DeltaCommitActionExecutor { private JavaRDD> inputRecordsRDD; + private String schema; public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD, + String schema) { + super(jsc, config, table, instantTime, WriteOperationType.UPSERT); + this.inputRecordsRDD = inputRecordsRDD; + this.schema = schema; + } + + public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @@ -44,6 +54,7 @@ public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, @Override public HoodieWriteMetadata execute() { return WriteHelper.write(instantTime, inputRecordsRDD, jsc, (HoodieTable) table, - config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),this, true); + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), + config.shouldCombineAllFieldsBeforeUpsert(), schema, this, true); } } diff --git a/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java index ba4ffb4731e23..5ced1c343865c 100644 --- a/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java @@ -231,14 +231,15 @@ private void testDeduplication( // Global dedup should be done based on recordKey only HoodieIndex index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(true); - List> dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1).collect(); + List> dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1, + false, null).collect(); assertEquals(1, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); // non-Global dedup should be done based on both recordKey and partitionPath index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(false); - dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1).collect(); + dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1, false, null).collect(); assertEquals(2, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java index 1c15c66410e50..efd2a68bf71ff 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java @@ -54,6 +54,11 @@ public HoodieJsonPayload preCombine(HoodieJsonPayload another) { return this; } + @Override + public HoodieJsonPayload
preCombine(HoodieJsonPayload another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java index 3b35b0d4dca16..72f129f6c7e3a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java @@ -32,7 +32,7 @@ public abstract class BaseAvroPayload implements Serializable { /** * Avro data extracted from the source converted to bytes. */ - public final byte[] recordBytes; + public byte[] recordBytes; /** * For purposes of preCombining. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java index 783422fc648f2..9639683a80324 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java @@ -24,6 +24,8 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import java.io.IOException; + /** * Empty payload used for deletions. */ @@ -40,6 +42,11 @@ public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another) { return another; } + @Override + public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another, Schema schema) throws IOException { + return another; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) { return Option.empty(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java index a3ab2b71ae980..ed454c821d689 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java @@ -50,6 +50,11 @@ public HoodieAvroPayload preCombine(HoodieAvroPayload another) { return this; } + @Override + public HoodieAvroPayload preCombine(HoodieAvroPayload another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java index 1afdd1b59af64..fe529d61cfcbf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java @@ -44,6 +44,22 @@ public interface HoodieRecordPayload extends Seri @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) T preCombine(T another); + /** + * When more than one HoodieRecord shares the same HoodieKey, this function combines the two payloads field by field, + * keeping the non-null value of each field, before attempting to insert/upsert (if combining is turned on in HoodieWriteConfig).
+ * e.g.: + * Before: + * id name age ts + * 1 Karl null 0.0 + * 1 null 18 0.0 + * After: + * id name age ts + * 1 Karl 18 0.0 + * + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + T preCombine(T another, Schema schema) throws IOException; + /** * This methods lets you write custom merging/combining logic to produce new values as a function of current value on * storage and whats contained in this object. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java index 845967c00ebfb..4fa61ce8ff0d4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java @@ -26,6 +26,7 @@ import org.apache.avro.generic.IndexedRecord; import java.io.IOException; +import java.util.List; /** * Default payload used for delta streamer. @@ -47,6 +48,26 @@ public OverwriteWithLatestAvroPayload(Option record) { this(record.isPresent() ? record.get() : null, (record1) -> 0); // natural order } + @Override + public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another, Schema schema) throws IOException { + // pick the payload with the greatest ordering value and aggregate all the fields, choosing the + // value that is not null + GenericRecord thisValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); + GenericRecord anotherValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(another.recordBytes, schema); + List fields = schema.getFields(); + + if (another.orderingVal.compareTo(orderingVal) > 0) { + GenericRecord anotherRec = combineAllFields(fields, anotherValue, thisValue); + another.recordBytes = HoodieAvroUtils.avroToBytes(anotherRec); + return another; + } else { + GenericRecord thisRec = combineAllFields(fields, thisValue, anotherValue); + this.recordBytes = HoodieAvroUtils.avroToBytes(thisRec); + return this; + } + + } + @Override public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) { // pick the payload with greatest ordering value @@ -57,6 +78,18 @@ public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload } } + public GenericRecord combineAllFields(List fields, GenericRecord priorRec, GenericRecord inferiorRec) { + for (int i = 0; i < fields.size(); i++) { + Object priorValue = priorRec.get(fields.get(i).name()); + Object inferiorValue = inferiorRec.get(fields.get(i).name()); + Object defaultVal = fields.get(i).defaultVal(); + if (overwriteField(priorValue, defaultVal) && !overwriteField(inferiorValue, defaultVal)) { + priorRec.put(fields.get(i).name(), inferiorValue); + } + } + return priorRec; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java index ff862ee7b7f7f..a45f196017a14 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java @@ -48,6 +48,11 @@ public HoodieRecordPayload preCombine(HoodieRecordPayload another) { return this; } + @Override + public HoodieRecordPayload preCombine(HoodieRecordPayload
another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java index 8442aff084a49..d44c2d56fa634 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java @@ -103,6 +103,11 @@ public RawTripTestPayload preCombine(RawTripTestPayload another) { return another; } + @Override + public RawTripTestPayload preCombine(RawTripTestPayload another, Schema schema) throws IOException { + return another; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { return this.getInsertValue(schema); diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java index cbc5b030c30df..73c51234d08a4 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -175,7 +175,7 @@ public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, String } public static JavaRDD doWriteOperation(HoodieWriteClient client, JavaRDD hoodieRecords, - String instantTime, WriteOperationType operation) throws HoodieException { + String instantTime, WriteOperationType operation, String schema) throws HoodieException { switch (operation) { case BULK_INSERT: Option userDefinedBulkInsertPartitioner = @@ -184,7 +184,7 @@ public static JavaRDD doWriteOperation(HoodieWriteClient client, Ja case INSERT: return client.insert(hoodieRecords, instantTime); case UPSERT: - return client.upsert(hoodieRecords, instantTime); + return client.upsert(hoodieRecords, instantTime, schema); default: throw new HoodieException("Not a valid operation type for doWriteOperation: " + operation.toString()); } diff --git a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 569ed345f1982..2636481fa208d 100644 --- a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -164,7 +164,8 @@ private[hudi] object HoodieSparkSqlWriter { (true, common.util.Option.empty()) } client.startCommitWithTime(instantTime) - val writeStatuses = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation) + val writeStatuses = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation, + schema.toString()) (writeStatuses, client) } else { val structName = s"${tblName}_record" diff --git a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 9ff114e46431d..279df0d2e3e62 100644 --- a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -131,7 +131,7 @@ public void testDoWriteOperationWithoutUserDefinedBulkInsertPartitioner() throws when(hoodieWriteClient.getConfig()).thenReturn(config); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", -
WriteOperationType.BULK_INSERT, null); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); @@ -144,7 +144,7 @@ public void testDoWriteOperationWithNonExistUserDefinedBulkInsertPartitioner() t Exception exception = assertThrows(HoodieException.class, () -> { DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT); + WriteOperationType.BULK_INSERT, null); }); assertThat(exception.getMessage(), containsString("Could not create UserDefinedBulkInsertPartitioner")); @@ -155,7 +155,7 @@ public void testDoWriteOperationWithUserDefinedBulkInsertPartitioner() throws Ho setAndVerifyHoodieWriteClientWith(NoOpBulkInsertPartitioner.class.getName()); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT); + WriteOperationType.BULK_INSERT, null); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); From b5486786b5a0807044f5be0d4ab8b121450ff998 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Sun, 20 Sep 2020 18:21:23 +0800 Subject: [PATCH 04/24] add test --- .../model/TestOverwriteWithLatestAvroPayload.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java index 7c5951a7cac04..3a13237af8bee 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java @@ -22,6 +22,7 @@ import org.apache.avro.Schema.Type; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.avro.HoodieAvroUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -42,7 +43,7 @@ public class TestOverwriteWithLatestAvroPayload { public void setUp() throws Exception { schema = Schema.createRecord(Arrays.asList( new Schema.Field("id", Schema.create(Schema.Type.STRING), "", null), - new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", null), + new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", ""), new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null), new Schema.Field("_hoodie_is_deleted", Schema.create(Type.BOOLEAN), "", false) )); @@ -58,12 +59,19 @@ public void testActiveRecords() throws IOException { GenericRecord record2 = new GenericData.Record(schema); record2.put("id", "2"); - record2.put("partition", "partition1"); + record2.put("partition", ""); record2.put("ts", 1L); record2.put("_hoodie_is_deleted", false); + GenericRecord record3 = new GenericData.Record(schema); + record3.put("id", "2"); + record3.put("partition", "partition0"); + record3.put("ts", 1L); + record3.put("_hoodie_is_deleted", false); + OverwriteWithLatestAvroPayload payload1 = new OverwriteWithLatestAvroPayload(record1, 1); OverwriteWithLatestAvroPayload payload2 = new OverwriteWithLatestAvroPayload(record2, 2); + OverwriteWithLatestAvroPayload payload3 = new OverwriteWithLatestAvroPayload(record3, 2); assertEquals(payload1.preCombine(payload2), payload2); assertEquals(payload2.preCombine(payload1), payload2); @@ -72,6 +80,9 @@ public void testActiveRecords() throws IOException { 
assertEquals(payload1.combineAndGetUpdateValue(record2, schema).get(), record1); assertEquals(payload2.combineAndGetUpdateValue(record1, schema).get(), record2); + + assertEquals(HoodieAvroUtils.bytesToAvro(payload1.preCombine(payload2, schema).recordBytes, schema), + HoodieAvroUtils.bytesToAvro(payload3.recordBytes, schema)); } @Test From e6efb8cb086cb1ccda423cbfda9aef7458e575aa Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Tue, 22 Sep 2020 09:37:04 +0800 Subject: [PATCH 05/24] test: compare schema-aware preCombine result directly against record3 --- .../hudi/common/model/TestOverwriteWithLatestAvroPayload.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java index 3a13237af8bee..204275c419601 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java @@ -71,7 +71,6 @@ public void testActiveRecords() throws IOException { OverwriteWithLatestAvroPayload payload1 = new OverwriteWithLatestAvroPayload(record1, 1); OverwriteWithLatestAvroPayload payload2 = new OverwriteWithLatestAvroPayload(record2, 2); - OverwriteWithLatestAvroPayload payload3 = new OverwriteWithLatestAvroPayload(record3, 2); assertEquals(payload1.preCombine(payload2), payload2); assertEquals(payload2.preCombine(payload1), payload2); @@ -82,7 +81,7 @@ public void testActiveRecords() throws IOException { assertEquals(payload1.combineAndGetUpdateValue(record2, schema).get(), record1); assertEquals(payload2.combineAndGetUpdateValue(record1, schema).get(), record2); assertEquals(HoodieAvroUtils.bytesToAvro(payload1.preCombine(payload2, schema).recordBytes, schema), - HoodieAvroUtils.bytesToAvro(payload3.recordBytes, schema)); + record3); } @Test From f9e3df9d237b93494ebfa33f6d51a3aae7ba157a Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Wed, 23 Sep 2020 21:45:30 +0800 Subject: [PATCH 06/24] preCombine all HoodieRecords and update all fields according to orderingVal --- .../apache/hudi/client/HoodieWriteClient.java | 6 +++- .../bootstrap/BootstrapRecordPayload.java | 7 ++++ .../apache/hudi/config/HoodieWriteConfig.java | 8 +++++ .../hudi/table/HoodieCopyOnWriteTable.java | 5 +-- .../hudi/table/HoodieMergeOnReadTable.java | 5 +-- .../org/apache/hudi/table/HoodieTable.java | 2 +- .../table/action/commit/BulkInsertHelper.java | 2 +- .../commit/UpsertCommitActionExecutor.java | 17 ++++++++-- .../hudi/table/action/commit/WriteHelper.java | 33 ++++++++++++++----- .../UpsertDeltaCommitActionExecutor.java | 17 ++++++++-- .../TestHoodieClientOnCopyOnWriteStorage.java | 5 +-- .../apache/hudi/common/HoodieJsonPayload.java | 5 +++ .../hudi/common/model/BaseAvroPayload.java | 2 +- .../model/EmptyHoodieRecordPayload.java | 7 ++++ .../hudi/common/model/HoodieAvroPayload.java | 5 +++ .../common/model/HoodieRecordPayload.java | 16 +++++++++ .../model/OverwriteWithLatestAvroPayload.java | 33 +++++++++++++++++++ .../TestOverwriteWithLatestAvroPayload.java | 14 ++++++-- .../testutils/AvroBinaryTestPayload.java | 5 +++ .../common/testutils/RawTripTestPayload.java | 5 +++ .../java/org/apache/hudi/DataSourceUtils.java | 4 +-- .../apache/hudi/HoodieSparkSqlWriter.scala | 3 +- .../org/apache/hudi/TestDataSourceUtils.java | 6 ++-- 23 files changed, 180 insertions(+), 32 deletions(-) diff --git a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java
index 142ff330a87a9..2a10c19741014 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java @@ -186,11 +186,15 @@ protected void rollBackInflightBootstrap() { * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ public JavaRDD upsert(JavaRDD> records, final String instantTime) { + return upsert(records, instantTime, null); + } + + public JavaRDD upsert(JavaRDD> records, final String instantTime, String schema) { HoodieTable table = getTableAndInitCtx(WriteOperationType.UPSERT, instantTime); table.validateUpsertSchema(); setOperationType(WriteOperationType.UPSERT); this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata result = table.upsert(jsc, instantTime, records); + HoodieWriteMetadata result = table.upsert(jsc, instantTime, records, schema); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } diff --git a/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java b/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java index fa508e42f120c..7d66b61ae36e6 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java @@ -25,6 +25,8 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import java.io.IOException; + public class BootstrapRecordPayload implements HoodieRecordPayload { private final GenericRecord record; @@ -38,6 +40,11 @@ public BootstrapRecordPayload preCombine(BootstrapRecordPayload another) { return this; } + @Override + public BootstrapRecordPayload preCombine(BootstrapRecordPayload another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) { return Option.ofNullable(record); diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 089474d15c2e8..5b00d9d049834 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -82,6 +82,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { public static final String DEFAULT_COMBINE_BEFORE_UPSERT = "true"; public static final String COMBINE_BEFORE_DELETE_PROP = "hoodie.combine.before.delete"; public static final String DEFAULT_COMBINE_BEFORE_DELETE = "true"; + public static final String COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP = "hoodie.combine.all.fields.before.upsert"; + public static final String DEFAULT_COMBINE_ALL_FIELDS_BEFORE_UPSERT = "false"; public static final String WRITE_STATUS_STORAGE_LEVEL = "hoodie.write.status.storage.level"; public static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; public static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; @@ -236,6 +238,10 @@ public boolean shouldCombineBeforeDelete() { return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_DELETE_PROP)); } + public boolean shouldCombineAllFieldsBeforeUpsert() { + return Boolean.parseBoolean(props.getProperty(COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP)); + } + public 
boolean shouldAllowMultiWriteOnSameInstant() { return Boolean.parseBoolean(props.getProperty(ALLOW_MULTI_WRITE_ON_SAME_INSTANT)); } @@ -987,6 +993,8 @@ protected void setDefaults() { DEFAULT_COMBINE_BEFORE_UPSERT); setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_DELETE_PROP), COMBINE_BEFORE_DELETE_PROP, DEFAULT_COMBINE_BEFORE_DELETE); + setDefaultOnCondition(props, !props.containsKey(COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP), + COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_ALL_FIELDS_BEFORE_UPSERT); setDefaultOnCondition(props, !props.containsKey(ALLOW_MULTI_WRITE_ON_SAME_INSTANT), ALLOW_MULTI_WRITE_ON_SAME_INSTANT, DEFAULT_ALLOW_MULTI_WRITE_ON_SAME_INSTANT); setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL, diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java index 31ba537fa2ce6..20d1fe32fe0f8 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java @@ -83,8 +83,9 @@ public HoodieCopyOnWriteTable(HoodieWriteConfig config, Configuration hadoopConf } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { - return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, + String schema) { + return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java index a236cdb9411de..a792af8c06bb5 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java @@ -76,8 +76,9 @@ public class HoodieMergeOnReadTable extends Hoodi } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { - return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, + String schema) { + return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java index 71bcb31314a81..0973dc47dd9d6 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -142,7 +142,7 @@ public static HoodieTable create(HoodieTableM * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, - JavaRDD> records); + JavaRDD> records, String schema); /** * Insert a batch of new records into Hoodie table at the supplied instantTime. 
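To make the new knob concrete, a hedged configuration sketch (the builder calls are assumed from HoodieWriteConfig's existing API; the base path is hypothetical):

Properties props = new Properties();
props.setProperty("hoodie.combine.all.fields.before.upsert", "true");
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hoodie/table")  // hypothetical base path
    .withProps(props)
    .build();
// Read back via the new accessor and threaded through WriteHelper.write(...)
// as the precombineAgg flag; it defaults to false when unset.
boolean combineAllFields = config.shouldCombineAllFieldsBeforeUpsert();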
diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java index 4683c8218d782..53bd99464a698 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/BulkInsertHelper.java @@ -50,7 +50,7 @@ public static > HoodieWriteMetadata bulkInsert( if (performDedupe) { dedupedRecords = WriteHelper.combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, - config.getBulkInsertShuffleParallelism(), ((HoodieTable)table)); + config.getBulkInsertShuffleParallelism(), ((HoodieTable)table), false, null); } final JavaRDD> repartitionedRecords; diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java index 0c4d08e35ac21..7a6347e0c0a02 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java @@ -32,10 +32,20 @@ public class UpsertCommitActionExecutor> extends CommitActionExecutor { private JavaRDD> inputRecordsRDD; + private String schema; public UpsertCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD, + String schema) { + super(jsc, config, table, instantTime, WriteOperationType.UPSERT); + this.inputRecordsRDD = inputRecordsRDD; + this.schema = schema; + } + + public UpsertCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @@ -43,6 +53,7 @@ public UpsertCommitActionExecutor(JavaSparkContext jsc, @Override public HoodieWriteMetadata execute() { return WriteHelper.write(instantTime, inputRecordsRDD, jsc, (HoodieTable)table, - config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, true); + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), + config.shouldCombineAllFieldsBeforeUpsert(), schema, this, true); } } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java index 92dcbb628ad15..15545ec7162f2 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java @@ -18,6 +18,7 @@ package org.apache.hudi.table.action.commit; +import org.apache.avro.Schema; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -38,11 +39,23 @@ public class WriteHelper> { public static > HoodieWriteMetadata write(String instantTime, JavaRDD> inputRecordsRDD, JavaSparkContext jsc, HoodieTable table, boolean shouldCombine, - int shuffleParallelism, CommitActionExecutor executor, boolean performTagging) { + int shuffleParallelism, CommitActionExecutor executor, + boolean performTagging) { + return write(instantTime, inputRecordsRDD, jsc, table, 
shouldCombine, shuffleParallelism, false, + null, executor, performTagging); + } + + public static > HoodieWriteMetadata write(String instantTime, + JavaRDD> inputRecordsRDD, JavaSparkContext jsc, + HoodieTable table, boolean shouldCombine, + int shuffleParallelism, boolean precombineAgg, + String schema, CommitActionExecutor executor, + boolean performTagging) { try { // De-dupe/merge if needed JavaRDD> dedupedRecords = - combineOnCondition(shouldCombine, inputRecordsRDD, shuffleParallelism, table); + combineOnCondition(shouldCombine, inputRecordsRDD, shuffleParallelism, table, precombineAgg, schema); + Instant lookupBegin = Instant.now(); JavaRDD> taggedRecords = dedupedRecords; @@ -70,8 +83,9 @@ private static > JavaRDD> tag( } public static > JavaRDD> combineOnCondition( - boolean condition, JavaRDD> records, int parallelism, HoodieTable table) { - return condition ? deduplicateRecords(records, table, parallelism) : records; + boolean condition, JavaRDD> records, int parallelism, HoodieTable table, + boolean precombineAgg, String schema) { + return condition ? deduplicateRecords(records, table, parallelism, precombineAgg, schema) : records; } /** @@ -82,12 +96,14 @@ public static > JavaRDD> combin * @return RDD of HoodieRecord already be deduplicated */ public static > JavaRDD> deduplicateRecords( - JavaRDD> records, HoodieTable table, int parallelism) { - return deduplicateRecords(records, table.getIndex(), parallelism); + JavaRDD> records, HoodieTable table, int parallelism, boolean precombineAgg, + String schema) { + return deduplicateRecords(records, table.getIndex(), parallelism, precombineAgg, schema); } public static > JavaRDD> deduplicateRecords( - JavaRDD> records, HoodieIndex index, int parallelism) { + JavaRDD> records, HoodieIndex index, int parallelism, boolean precombineAgg, + String schema) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); @@ -96,7 +112,8 @@ public static > JavaRDD> dedupl return new Tuple2<>(key, record); }).reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") - T reducedData = (T) rec1.getData().preCombine(rec2.getData()); + T reducedData = precombineAgg && schema != null ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema)) + : (T) rec1.getData().preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records.
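As a usage note for reviewers, a short sketch of the widened deduplicateRecords signature (the records RDD, the HoodieIndex instance, and schemaStr are assumed, mirroring the updated test call sites):

// The two new trailing arguments are precombineAgg and the Avro schema string;
// passing (false, null) keeps the legacy single-argument preCombine behavior.
JavaRDD<HoodieRecord<OverwriteWithLatestAvroPayload>> deduped =
    WriteHelper.deduplicateRecords(records, index, 1, true, schemaStr);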
diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java index 1809078cb85e3..88c95155aa9ab 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java @@ -33,10 +33,20 @@ public class UpsertDeltaCommitActionExecutor> extends DeltaCommitActionExecutor { private JavaRDD> inputRecordsRDD; + private String schema; public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD, + String schema) { + super(jsc, config, table, instantTime, WriteOperationType.UPSERT); + this.inputRecordsRDD = inputRecordsRDD; + this.schema = schema; + } + + public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @@ -44,6 +54,7 @@ public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, @Override public HoodieWriteMetadata execute() { return WriteHelper.write(instantTime, inputRecordsRDD, jsc, (HoodieTable) table, - config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),this, true); + config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), + config.shouldCombineAllFieldsBeforeUpsert(), schema, this, true); } } diff --git a/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java index ba4ffb4731e23..5ced1c343865c 100644 --- a/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java @@ -231,14 +231,15 @@ private void testDeduplication( // Global dedup should be done based on recordKey only HoodieIndex index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(true); - List> dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1).collect(); + List> dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1, + false, null).collect(); assertEquals(1, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); // non-Global dedup should be done based on both recordKey and partitionPath index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(false); - dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1).collect(); + dedupedRecs = WriteHelper.deduplicateRecords(records, index, 1, false, null).collect(); assertEquals(2, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java index 1c15c66410e50..efd2a68bf71ff 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java @@ -54,6 +54,11 @@ public HoodieJsonPayload preCombine(HoodieJsonPayload another) { return this; } + @Override + public HoodieJsonPayload
preCombine(HoodieJsonPayload another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java index 3b35b0d4dca16..72f129f6c7e3a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/BaseAvroPayload.java @@ -32,7 +32,7 @@ public abstract class BaseAvroPayload implements Serializable { /** * Avro data extracted from the source converted to bytes. */ - public final byte[] recordBytes; + public byte[] recordBytes; /** * For purposes of preCombining. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java index 783422fc648f2..9639683a80324 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java @@ -24,6 +24,8 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import java.io.IOException; + /** * Empty payload used for deletions. */ @@ -40,6 +42,11 @@ public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another) { return another; } + @Override + public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another, Schema schema) throws IOException { + return another; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) { return Option.empty(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java index a3ab2b71ae980..ed454c821d689 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java @@ -50,6 +50,11 @@ public HoodieAvroPayload preCombine(HoodieAvroPayload another) { return this; } + @Override + public HoodieAvroPayload preCombine(HoodieAvroPayload another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java index 1afdd1b59af64..fe529d61cfcbf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java @@ -44,6 +44,22 @@ public interface HoodieRecordPayload extends Seri @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) T preCombine(T another); + /** + * When more than one HoodieRecord shares the same HoodieKey, this function combines the two payloads field by field, + * keeping the non-null value of each field, before attempting to insert/upsert (if combining is turned on in HoodieWriteConfig).
+ * eg: 1) + * Before: + * id name age ts + * 1 Karl null 0.0 + * 1 null 18 0.0 + * After: + * id name age ts + * 1 Karl 18 0.0 + * + */ + @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + T preCombine(T another, Schema schema) throws IOException; + /** * This methods lets you write custom merging/combining logic to produce new values as a function of current value on * storage and whats contained in this object. diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java index 845967c00ebfb..4fa61ce8ff0d4 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java @@ -26,6 +26,7 @@ import org.apache.avro.generic.IndexedRecord; import java.io.IOException; +import java.util.List; /** * Default payload used for delta streamer. @@ -47,6 +48,26 @@ public OverwriteWithLatestAvroPayload(Option record) { this(record.isPresent() ? record.get() : null, (record1) -> 0); // natural order } + @Override + public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another,Schema schema) throws IOException { + // pick the payload with greatest ordering value and aggregate all the fields,choosing the + // value that is not null + GenericRecord thisValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); + GenericRecord anotherValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(another.recordBytes,schema); + List fields = schema.getFields(); + + if (another.orderingVal.compareTo(orderingVal) > 0) { + GenericRecord anotherRoc = combineAllFields(fields,anotherValue,thisValue); + another.recordBytes = HoodieAvroUtils.avroToBytes(anotherRoc); + return another; + } else { + GenericRecord thisRoc = combineAllFields(fields,thisValue,anotherValue); + this.recordBytes = HoodieAvroUtils.avroToBytes(thisRoc); + return this; + } + + } + @Override public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) { // pick the payload with greatest ordering value @@ -57,6 +78,18 @@ public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload } } + public GenericRecord combineAllFields(List fields,GenericRecord priorRec,GenericRecord inferiorRoc) { + for (int i = 0; i < fields.size(); i++) { + Object priorValue = priorRec.get(fields.get(i).name()); + Object inferiorValue = inferiorRoc.get(fields.get(i).name()); + Object defaultVal = fields.get(i).defaultVal(); + if (overwriteField(priorValue,defaultVal) && !overwriteField(inferiorValue,defaultVal)) { + priorRec.put(fields.get(i).name(), inferiorValue); + } + } + return priorRec; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java index 7c5951a7cac04..204275c419601 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java @@ -22,6 +22,7 @@ import org.apache.avro.Schema.Type; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; 
+import org.apache.hudi.avro.HoodieAvroUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -42,7 +43,7 @@ public class TestOverwriteWithLatestAvroPayload { public void setUp() throws Exception { schema = Schema.createRecord(Arrays.asList( new Schema.Field("id", Schema.create(Schema.Type.STRING), "", null), - new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", null), + new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", ""), new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null), new Schema.Field("_hoodie_is_deleted", Schema.create(Type.BOOLEAN), "", false) )); @@ -58,10 +59,16 @@ public void testActiveRecords() throws IOException { GenericRecord record2 = new GenericData.Record(schema); record2.put("id", "2"); - record2.put("partition", "partition1"); + record2.put("partition", ""); record2.put("ts", 1L); record2.put("_hoodie_is_deleted", false); + GenericRecord record3 = new GenericData.Record(schema); + record3.put("id", "2"); + record3.put("partition", "partition0"); + record3.put("ts", 1L); + record3.put("_hoodie_is_deleted", false); + OverwriteWithLatestAvroPayload payload1 = new OverwriteWithLatestAvroPayload(record1, 1); OverwriteWithLatestAvroPayload payload2 = new OverwriteWithLatestAvroPayload(record2, 2); assertEquals(payload1.preCombine(payload2), payload2); @@ -72,6 +79,9 @@ public void testActiveRecords() throws IOException { assertEquals(payload1.combineAndGetUpdateValue(record2, schema).get(), record1); assertEquals(payload2.combineAndGetUpdateValue(record1, schema).get(), record2); + + assertEquals(HoodieAvroUtils.bytesToAvro(payload1.preCombine(payload2, schema).recordBytes,schema), + record3); } @Test diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java index ff862ee7b7f7f..a45f196017a14 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java @@ -48,6 +48,11 @@ public HoodieRecordPayload preCombine(HoodieRecordPayload another) { return this; } + @Override + public HoodieRecordPayload preCombine(HoodieRecordPayload another, Schema schema) throws IOException { + return this; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java index 8442aff084a49..d44c2d56fa634 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java @@ -103,6 +103,11 @@ public RawTripTestPayload preCombine(RawTripTestPayload another) { return another; } + @Override + public RawTripTestPayload preCombine(RawTripTestPayload another, Schema schema) throws IOException { + return another; + } + @Override public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { return this.getInsertValue(schema); diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java index cbc5b030c30df..73c51234d08a4 100644 --- 
a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -175,7 +175,7 @@ public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, String } public static JavaRDD doWriteOperation(HoodieWriteClient client, JavaRDD hoodieRecords, - String instantTime, WriteOperationType operation) throws HoodieException { + String instantTime, WriteOperationType operation, String schema) throws HoodieException { switch (operation) { case BULK_INSERT: Option userDefinedBulkInsertPartitioner = @@ -184,7 +184,7 @@ public static JavaRDD doWriteOperation(HoodieWriteClient client, Ja case INSERT: return client.insert(hoodieRecords, instantTime); case UPSERT: - return client.upsert(hoodieRecords, instantTime); + return client.upsert(hoodieRecords, instantTime, schema); default: throw new HoodieException("Not a valid operation type for doWriteOperation: " + operation.toString()); } diff --git a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 450bd73e455f5..f272875ee3a27 100644 --- a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -165,7 +165,8 @@ private[hudi] object HoodieSparkSqlWriter { (true, common.util.Option.empty()) } client.startCommitWithTime(instantTime) - val writeStatuses = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation) + val writeStatuses = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation + , schema.toString()) (writeStatuses, client) } else { val structName = s"${tblName}_record" diff --git a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 9ff114e46431d..279df0d2e3e62 100644 --- a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -131,7 +131,7 @@ public void testDoWriteOperationWithoutUserDefinedBulkInsertPartitioner() throws when(hoodieWriteClient.getConfig()).thenReturn(config); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT); + WriteOperationType.BULK_INSERT, null); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); @@ -144,7 +144,7 @@ public void testDoWriteOperationWithNonExistUserDefinedBulkInsertPartitioner() t Exception exception = assertThrows(HoodieException.class, () -> { DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT); + WriteOperationType.BULK_INSERT, null); }); assertThat(exception.getMessage(), containsString("Could not create UserDefinedBulkInsertPartitioner")); @@ -155,7 +155,7 @@ public void testDoWriteOperationWithUserDefinedBulkInsertPartitioner() throws Ho setAndVerifyHoodieWriteClientWith(NoOpBulkInsertPartitioner.class.getName()); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT); + WriteOperationType.BULK_INSERT, null); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); From ec6e8a260bafbd9898f609c0616bde5f317b3442 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Thu, 1 Oct 2020 15:07:09 +0800 Subject: [PATCH 07/24] update --- 
.../apache/hudi/client/HoodieWriteClient.java | 16 ++-------------- .../hudi/table/HoodieCopyOnWriteTable.java | 5 ++--- .../hudi/table/HoodieMergeOnReadTable.java | 5 ++--- .../org/apache/hudi/table/HoodieTable.java | 2 +- .../commit/UpsertCommitActionExecutor.java | 17 ++++------------- .../hudi/table/action/commit/WriteHelper.java | 19 ++++++++++--------- .../UpsertDeltaCommitActionExecutor.java | 17 ++++------------- .../java/org/apache/hudi/DataSourceUtils.java | 6 ++++-- .../apache/hudi/HoodieSparkSqlWriter.scala | 3 +-- .../org/apache/hudi/TestDataSourceUtils.java | 6 +++--- 10 files changed, 33 insertions(+), 63 deletions(-) diff --git a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java index 3e741701c4192..142ff330a87a9 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java @@ -190,19 +190,7 @@ public JavaRDD upsert(JavaRDD> records, final Strin table.validateUpsertSchema(); setOperationType(WriteOperationType.UPSERT); this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata result = table.upsert(jsc, instantTime, records, null); - if (result.getIndexLookupDuration().isPresent()) { - metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); - } - return postWrite(result, instantTime, table); - } - - public JavaRDD upsert(JavaRDD> records, final String instantTime, String schema) { - HoodieTable table = getTableAndInitCtx(WriteOperationType.UPSERT, instantTime); - table.validateUpsertSchema(); - setOperationType(WriteOperationType.UPSERT); - this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata result = table.upsert(jsc, instantTime, records, schema); + HoodieWriteMetadata result = table.upsert(jsc, instantTime, records); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } @@ -210,7 +198,7 @@ public JavaRDD upsert(JavaRDD> records, final Strin } /** - * Upserts the given prepared records into the Hoodie table,/TestHoodieClientOnCopyOnWriteStorage at the supplied instantTime. + * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. *
<p>
* This implementation requires that the input records are already tagged, and de-duped if needed. * diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java index 20d1fe32fe0f8..31ba537fa2ce6 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java @@ -83,9 +83,8 @@ public HoodieCopyOnWriteTable(HoodieWriteConfig config, Configuration hadoopConf } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, - String schema) { - return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { + return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java index a792af8c06bb5..a236cdb9411de 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java @@ -76,9 +76,8 @@ public class HoodieMergeOnReadTable extends Hoodi } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, - String schema) { - return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { + return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java index 0973dc47dd9d6..71bcb31314a81 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -142,7 +142,7 @@ public static HoodieTable create(HoodieTableM * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, - JavaRDD> records, String schema); + JavaRDD> records); /** * Insert a batch of new records into Hoodie table at the supplied instantTime. 
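The field-level merge that the schema-aware preCombine (introduced in PATCH 03 above) performs can be exercised with plain Avro, independent of Spark and of the executors that follow. The sketch below is illustrative only, not Hudi API: the class and helper names (PreCombineSketch, mergeNonNullFields, isNullOrDefault) are invented, and isNullOrDefault only approximates OverwriteWithLatestAvroPayload.overwriteField. It reproduces the Before/After example from the HoodieRecordPayload javadoc.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;

import java.util.Objects;

public class PreCombineSketch {

  // Approximation of overwriteField: a field counts as "empty" when it is null
  // or still carries its schema default value.
  static boolean isNullOrDefault(Object value, Object defaultVal) {
    return value == null || Objects.equals(value, defaultVal);
  }

  // Keep the winning record (the one with the greater ordering value) but
  // back-fill each of its empty fields from the losing record.
  static GenericRecord mergeNonNullFields(Schema schema, GenericRecord winner, GenericRecord loser) {
    for (Schema.Field field : schema.getFields()) {
      Object winnerVal = winner.get(field.name());
      Object loserVal = loser.get(field.name());
      if (isNullOrDefault(winnerVal, field.defaultVal()) && !isNullOrDefault(loserVal, field.defaultVal())) {
        winner.put(field.name(), loserVal);
      }
    }
    return winner;
  }

  public static void main(String[] args) {
    Schema schema = new Schema.Parser().parse("{\"type\":\"record\",\"name\":\"r\",\"fields\":["
        + "{\"name\":\"id\",\"type\":\"string\"},"
        + "{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null},"
        + "{\"name\":\"age\",\"type\":[\"null\",\"int\"],\"default\":null},"
        + "{\"name\":\"ts\",\"type\":\"double\"}]}");

    GenericRecord first = new GenericData.Record(schema);
    first.put("id", "1");
    first.put("name", "Karl");
    first.put("ts", 0.0);               // age stays null

    GenericRecord second = new GenericData.Record(schema);
    second.put("id", "1");
    second.put("age", 18);
    second.put("ts", 0.0);              // name stays null

    // Prints {"id": "1", "name": "Karl", "age": 18, "ts": 0.0} -- the "After" row
    // from the javadoc example.
    System.out.println(mergeNonNullFields(schema, first, second));
  }
}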
diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java index 7a6347e0c0a02..06cbf2294a4b3 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -32,20 +33,10 @@ public class UpsertCommitActionExecutor> extends CommitActionExecutor { private JavaRDD> inputRecordsRDD; - private String schema; public UpsertCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - String schema) { - super(jsc, config, table, instantTime, WriteOperationType.UPSERT); - this.inputRecordsRDD = inputRecordsRDD; - this.schema = schema; - } - - public UpsertCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @@ -54,6 +45,6 @@ public UpsertCommitActionExecutor(JavaSparkContext jsc, public HoodieWriteMetadata execute() { return WriteHelper.write(instantTime, inputRecordsRDD, jsc, (HoodieTable)table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - config.shouldCombineAllFieldsBeforeUpsert(), schema, this, true); + config.shouldCombineAllFieldsBeforeUpsert(), Option.of(config.getSchema()), this, true); } } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java index 15545ec7162f2..8058d477fc5ca 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java @@ -22,6 +22,7 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; @@ -46,11 +47,11 @@ public static > HoodieWriteMetadata write(Strin } public static > HoodieWriteMetadata write(String instantTime, - JavaRDD> inputRecordsRDD, JavaSparkContext jsc, - HoodieTable table, boolean shouldCombine, - int shuffleParallelism,boolean precombineAgg, - String schema, CommitActionExecutor executor, - boolean performTagging) { + JavaRDD> inputRecordsRDD, JavaSparkContext jsc, + HoodieTable table, boolean shouldCombine, + int shuffleParallelism, boolean precombineAgg, + Option schema, CommitActionExecutor executor, + boolean performTagging) { try { // De-dupe/merge if needed JavaRDD> dedupedRecords = @@ -84,7 +85,7 @@ private static > JavaRDD> tag( public static > JavaRDD> combineOnCondition( boolean condition, JavaRDD> records, int parallelism, HoodieTable table, - 
boolean precombineAgg, String schema) { + boolean precombineAgg, Option schema) { return condition ? deduplicateRecords(records, table, parallelism, precombineAgg, schema) : records; } @@ -97,13 +98,13 @@ public static > JavaRDD> combin */ public static > JavaRDD> deduplicateRecords( JavaRDD> records, HoodieTable table, int parallelism, boolean precombineAgg, - String schema) { + Option schema) { return deduplicateRecords(records, table.getIndex(), parallelism, precombineAgg, schema); } public static > JavaRDD> deduplicateRecords( JavaRDD> records, HoodieIndex index, int parallelism, boolean precombineAgg, - String schema) { + Option schema) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); @@ -112,7 +113,7 @@ public static > JavaRDD> dedupl return new Tuple2<>(key, record); }).reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") - T reducedData = precombineAgg && schema != null ? (T) rec1.getData().preCombine(rec2.getData(),new Schema.Parser().parse(schema)) + T reducedData = precombineAgg && !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(),new Schema.Parser().parse(schema.get())) : (T) rec1.getData().preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java index 88c95155aa9ab..e3d35093ad492 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java @@ -21,6 +21,7 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; @@ -33,20 +34,10 @@ public class UpsertDeltaCommitActionExecutor> extends DeltaCommitActionExecutor { private JavaRDD> inputRecordsRDD; - private String schema; public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - String schema) { - super(jsc, config, table, instantTime, WriteOperationType.UPSERT); - this.inputRecordsRDD = inputRecordsRDD; - this.schema = schema; - } - - public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @@ -55,6 +46,6 @@ public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, public HoodieWriteMetadata execute() { return WriteHelper.write(instantTime, inputRecordsRDD, jsc, (HoodieTable) table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - config.shouldCombineAllFieldsBeforeUpsert(),schema, this, true); + config.shouldCombineAllFieldsBeforeUpsert(),Option.of(config.getSchema()), this, true); } } diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java 
b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java index 73c51234d08a4..fa573b2afe619 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -50,6 +50,8 @@ import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import scala.Some; +import scala.Some$; import java.io.IOException; import java.util.ArrayList; @@ -175,7 +177,7 @@ public static HoodieWriteClient createHoodieClient(JavaSparkContext jssc, String } public static JavaRDD doWriteOperation(HoodieWriteClient client, JavaRDD hoodieRecords, - String instantTime, WriteOperationType operation, String schema) throws HoodieException { + String instantTime, WriteOperationType operation) throws HoodieException { switch (operation) { case BULK_INSERT: Option userDefinedBulkInsertPartitioner = @@ -184,7 +186,7 @@ public static JavaRDD doWriteOperation(HoodieWriteClient client, Ja case INSERT: return client.insert(hoodieRecords, instantTime); case UPSERT: - return client.upsert(hoodieRecords, instantTime, schema); + return client.upsert(hoodieRecords, instantTime); default: throw new HoodieException("Not a valid operation type for doWriteOperation: " + operation.toString()); } diff --git a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index 2636481fa208d..569ed345f1982 100644 --- a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -164,8 +164,7 @@ private[hudi] object HoodieSparkSqlWriter { (true, common.util.Option.empty()) } client.startCommitWithTime(instantTime) - val writeStatuses = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation - , schema.toString()) + val writeStatuses = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation) (writeStatuses, client) } else { val structName = s"${tblName}_record" diff --git a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 279df0d2e3e62..9ff114e46431d 100644 --- a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -131,7 +131,7 @@ public void testDoWriteOperationWithoutUserDefinedBulkInsertPartitioner() throws when(hoodieWriteClient.getConfig()).thenReturn(config); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT, null); + WriteOperationType.BULK_INSERT); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); @@ -144,7 +144,7 @@ public void testDoWriteOperationWithNonExistUserDefinedBulkInsertPartitioner() t Exception exception = assertThrows(HoodieException.class, () -> { DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT, null); + WriteOperationType.BULK_INSERT); }); assertThat(exception.getMessage(), containsString("Could not create UserDefinedBulkInsertPartitioner")); @@ -155,7 +155,7 @@ public void testDoWriteOperationWithUserDefinedBulkInsertPartitioner() throws Ho setAndVerifyHoodieWriteClientWith(NoOpBulkInsertPartitioner.class.getName()); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - 
WriteOperationType.BULK_INSERT, null); + WriteOperationType.BULK_INSERT); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); From bf5f22bb2778f6134f61c52c2fe75e1c0dff2e5f Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Thu, 1 Oct 2020 16:28:02 +0800 Subject: [PATCH 08/24] update --- .../java/org/apache/hudi/table/action/commit/WriteHelper.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java index 8058d477fc5ca..3720855d83f0f 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/WriteHelper.java @@ -113,8 +113,8 @@ public static > JavaRDD> dedupl return new Tuple2<>(key, record); }).reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") - T reducedData = precombineAgg && !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(),new Schema.Parser().parse(schema.get())) - : (T) rec1.getData().preCombine(rec2.getData()); + T reducedData = precombineAgg && !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(), + new Schema.Parser().parse(schema.get())) : (T) rec1.getData().preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. From 54e2a08b01a61e006eadddbfe9c72de134c6af8f Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Thu, 1 Oct 2020 16:42:02 +0800 Subject: [PATCH 09/24] update --- .../org/apache/hudi/client/HoodieWriteClient.java | 6 +----- .../apache/hudi/table/HoodieCopyOnWriteTable.java | 5 ++--- .../apache/hudi/table/HoodieMergeOnReadTable.java | 5 ++--- .../java/org/apache/hudi/table/HoodieTable.java | 2 +- .../action/commit/UpsertCommitActionExecutor.java | 15 ++------------- .../UpsertDeltaCommitActionExecutor.java | 14 ++------------ .../java/org/apache/hudi/TestDataSourceUtils.java | 6 +++--- 7 files changed, 13 insertions(+), 40 deletions(-) diff --git a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java index 7439f945dc453..7970623eeff52 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java @@ -186,15 +186,11 @@ protected void rollBackInflightBootstrap() { * @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts */ public JavaRDD upsert(JavaRDD> records, final String instantTime) { - return upsert(records, instantTime, null); - } - - public JavaRDD upsert(JavaRDD> records, final String instantTime, String schema) { HoodieTable table = getTableAndInitCtx(WriteOperationType.UPSERT, instantTime); table.validateUpsertSchema(); setOperationType(WriteOperationType.UPSERT); this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata result = table.upsert(jsc, instantTime, records, schema); + HoodieWriteMetadata result = table.upsert(jsc, instantTime, records); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java 
b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java index 89ad5d8a84d97..a4bcbbf835efb 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieCopyOnWriteTable.java @@ -84,9 +84,8 @@ public HoodieCopyOnWriteTable(HoodieWriteConfig config, Configuration hadoopConf } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, - String schema) { - return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { + return new UpsertCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java index a792af8c06bb5..a236cdb9411de 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieMergeOnReadTable.java @@ -76,9 +76,8 @@ public class HoodieMergeOnReadTable extends Hoodi } @Override - public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records, - String schema) { - return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records, schema).execute(); + public HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, JavaRDD> records) { + return new UpsertDeltaCommitActionExecutor<>(jsc, config, this, instantTime, records).execute(); } @Override diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java index 1eaadd94b7146..5c824a6fc92f3 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -142,7 +142,7 @@ public static HoodieTable create(HoodieTableM * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata upsert(JavaSparkContext jsc, String instantTime, - JavaRDD> records, String schema); + JavaRDD> records); /** * Insert a batch of new records into Hoodie table at the supplied instantTime. 
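After this revert, the schema consumed by the schema-aware preCombine no longer travels through the public upsert() API; the executors read it from the write config instead (Option.of(config.getSchema()) below). A caller-side sketch under stated assumptions: an existing JavaSparkContext jsc, a JavaRDD<HoodieRecord<OverwriteWithLatestAvroPayload>> records, an Avro Schema avroSchema, and an illustrative table name and base path.

HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hoodie/sample_table")        // illustrative base path
    .forTable("sample_table")
    .withSchema(avroSchema.toString())           // read back later via config.getSchema()
    .build();

HoodieWriteClient<OverwriteWithLatestAvroPayload> client = new HoodieWriteClient<>(jsc, cfg);
String instantTime = client.startCommit();
JavaRDD<WriteStatus> statuses = client.upsert(records, instantTime);  // signature is unchanged again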
diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java index 26ae5480fb2cf..40dc087633c98 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/commit/UpsertCommitActionExecutor.java @@ -33,20 +33,10 @@ public class UpsertCommitActionExecutor> extends CommitActionExecutor { private JavaRDD> inputRecordsRDD; - private String schema; public UpsertCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - String schema) { - super(jsc, config, table, instantTime, WriteOperationType.UPSERT); - this.inputRecordsRDD = inputRecordsRDD; - this.schema = schema; - } - - public UpsertCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } @@ -55,7 +45,6 @@ public UpsertCommitActionExecutor(JavaSparkContext jsc, public HoodieWriteMetadata execute() { return WriteHelper.write(instantTime, inputRecordsRDD, jsc, (HoodieTable)table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - config.shouldCombineAllFieldsBeforeUpsert(), Option.of(config.getSchema()), this, true); } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java index ef20588b9c7db..e3d35093ad492 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/deltacommit/UpsertDeltaCommitActionExecutor.java @@ -34,20 +34,10 @@ public class UpsertDeltaCommitActionExecutor> extends DeltaCommitActionExecutor { private JavaRDD> inputRecordsRDD; - private String schema; public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD, - String schema) { - super(jsc, config, table, instantTime, WriteOperationType.UPSERT); - this.inputRecordsRDD = inputRecordsRDD; - this.schema = schema; - } - - public UpsertDeltaCommitActionExecutor(JavaSparkContext jsc, - HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + HoodieWriteConfig config, HoodieTable table, + String instantTime, JavaRDD> inputRecordsRDD) { super(jsc, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; } diff --git a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java index 279df0d2e3e62..9ff114e46431d 100644 --- a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java +++ b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java @@ -131,7 +131,7 @@ public void testDoWriteOperationWithoutUserDefinedBulkInsertPartitioner() throws when(hoodieWriteClient.getConfig()).thenReturn(config); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - 
WriteOperationType.BULK_INSERT, null); + WriteOperationType.BULK_INSERT); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); @@ -144,7 +144,7 @@ public void testDoWriteOperationWithNonExistUserDefinedBulkInsertPartitioner() t Exception exception = assertThrows(HoodieException.class, () -> { DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT, null); + WriteOperationType.BULK_INSERT); }); assertThat(exception.getMessage(), containsString("Could not create UserDefinedBulkInsertPartitioner")); @@ -155,7 +155,7 @@ public void testDoWriteOperationWithUserDefinedBulkInsertPartitioner() throws Ho setAndVerifyHoodieWriteClientWith(NoOpBulkInsertPartitioner.class.getName()); DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time", - WriteOperationType.BULK_INSERT, null); + WriteOperationType.BULK_INSERT); verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(), optionCaptor.capture()); From 11323b228fcafe4207e70f73f2265f6e73d94c34 Mon Sep 17 00:00:00 2001 From: Karl_Wang Date: Thu, 1 Oct 2020 16:43:58 +0800 Subject: [PATCH 10/24] Update DataSourceUtils.java --- hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java index 3e41edc8550fb..b61d3a6b38188 100644 --- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java +++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java @@ -52,8 +52,6 @@ import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import scala.Some; -import scala.Some$; import java.io.IOException; import java.util.ArrayList; From f67896492a457f245a8bb058a93cda0e10549a3a Mon Sep 17 00:00:00 2001 From: Karl_Wang Date: Sat, 10 Oct 2020 22:18:02 +0800 Subject: [PATCH 11/24] Update AbstractWriteHelper.java --- .../apache/hudi/table/action/commit/AbstractWriteHelper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java index f27db573ef384..0ddff9fa59a1f 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java @@ -40,7 +40,7 @@ public HoodieWriteMetadata write(String instantTime, int shuffleParallelism, BaseCommitActionExecutor executor, boolean performTagging) { - return write(instantTime, inputRecordsRDD, context, table, shouldCombine, shuffleParallelism,false, + return write(instantTime, inputRecords, context, table, shouldCombine, shuffleParallelism,false, null, executor, performTagging); } From cca47ce3f28c5e115afa003e2348b9f08cc6ed21 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Tue, 13 Oct 2020 17:58:22 +0800 Subject: [PATCH 12/24] update --- .../bootstrap/BootstrapRecordPayload.java | 7 -- .../apache/hudi/config/HoodieWriteConfig.java | 8 -- .../action/commit/AbstractWriteHelper.java | 18 ++-- .../SparkUpsertCommitActionExecutor.java | 2 +- .../table/action/commit/SparkWriteHelper.java | 6 +- .../SparkUpsertDeltaCommitActionExecutor.java 
| 2 +- .../apache/hudi/common/HoodieJsonPayload.java | 5 -- .../model/EmptyHoodieRecordPayload.java | 7 -- .../hudi/common/model/HoodieAvroPayload.java | 5 -- .../common/model/HoodieRecordPayload.java | 14 +--- .../model/OverwritePrecombineAvroPayload.java | 66 +++++++++++++++ .../model/OverwriteWithLatestAvroPayload.java | 33 -------- .../hudi/common/model/PartialAvroPayload.java | 84 +++++++++++++++++++ .../TestOverwritePrecombineAvroPayload.java | 66 +++++++++++++++ .../TestOverwriteWithLatestAvroPayload.java | 14 +--- .../testutils/AvroBinaryTestPayload.java | 5 -- .../common/testutils/RawTripTestPayload.java | 5 -- 17 files changed, 235 insertions(+), 112 deletions(-) create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java create mode 100644 hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java index 7d66b61ae36e6..fa508e42f120c 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/bootstrap/BootstrapRecordPayload.java @@ -25,8 +25,6 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; -import java.io.IOException; - public class BootstrapRecordPayload implements HoodieRecordPayload { private final GenericRecord record; @@ -40,11 +38,6 @@ public BootstrapRecordPayload preCombine(BootstrapRecordPayload another) { return this; } - @Override - public BootstrapRecordPayload preCombine(BootstrapRecordPayload another, Schema schema) throws IOException { - return this; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) { return Option.ofNullable(record); diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index f2283fcdcd5ba..bb6560057509a 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -81,8 +81,6 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { public static final String DEFAULT_COMBINE_BEFORE_UPSERT = "true"; public static final String COMBINE_BEFORE_DELETE_PROP = "hoodie.combine.before.delete"; public static final String DEFAULT_COMBINE_BEFORE_DELETE = "true"; - public static final String COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP = "hoodie.combine.all.fields.before.upsert"; - public static final String DEFAULT_COMBINE_ALL_FIELDS_BEFORE_UPSERT = "false"; public static final String WRITE_STATUS_STORAGE_LEVEL = "hoodie.write.status.storage.level"; public static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; public static final String HOODIE_AUTO_COMMIT_PROP = "hoodie.auto.commit"; @@ -239,10 +237,6 @@ public boolean shouldCombineBeforeDelete() { return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_DELETE_PROP)); } - public boolean shouldCombineAllFieldsBeforeUpsert() { - return 
Boolean.parseBoolean(props.getProperty(COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP)); - } - public boolean shouldAllowMultiWriteOnSameInstant() { return Boolean.parseBoolean(props.getProperty(ALLOW_MULTI_WRITE_ON_SAME_INSTANT)); } @@ -1007,8 +1001,6 @@ protected void setDefaults() { DEFAULT_COMBINE_BEFORE_UPSERT); setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_DELETE_PROP), COMBINE_BEFORE_DELETE_PROP, DEFAULT_COMBINE_BEFORE_DELETE); - setDefaultOnCondition(props, !props.containsKey(COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP), - COMBINE_ALL_FIELDS_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_ALL_FIELDS_BEFORE_UPSERT); setDefaultOnCondition(props, !props.containsKey(ALLOW_MULTI_WRITE_ON_SAME_INSTANT), ALLOW_MULTI_WRITE_ON_SAME_INSTANT, DEFAULT_ALLOW_MULTI_WRITE_ON_SAME_INSTANT); setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), WRITE_STATUS_STORAGE_LEVEL, diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java index 0ddff9fa59a1f..1b97804fcb8e2 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java @@ -40,7 +40,7 @@ public HoodieWriteMetadata write(String instantTime, int shuffleParallelism, BaseCommitActionExecutor executor, boolean performTagging) { - return write(instantTime, inputRecords, context, table, shouldCombine, shuffleParallelism,false, + return write(instantTime, inputRecords, context, table, shouldCombine, shuffleParallelism, null, executor, performTagging); } @@ -49,15 +49,14 @@ public HoodieWriteMetadata write(String instantTime, HoodieEngineContext context, HoodieTable table, boolean shouldCombine, - int shuffleParallelism, boolean precombineAgg, + int shuffleParallelism, Option schema, BaseCommitActionExecutor executor, boolean performTagging) { try { // De-dupe/merge if needed I dedupedRecords = - combineOnCondition(shouldCombine, inputRecordsRDD, shuffleParallelism, table, - precombineAgg, schema); + combineOnCondition(shouldCombine, inputRecordsRDD, shuffleParallelism, table, schema); Instant lookupBegin = Instant.now(); I taggedRecords = dedupedRecords; @@ -85,9 +84,8 @@ private I tag( } public I combineOnCondition( - boolean condition, I records, int parallelism, HoodieTable table, - boolean precombineAgg, Option schema) { - return condition ? deduplicateRecords(records, table, parallelism, precombineAgg, schema) : records; + boolean condition, I records, int parallelism, HoodieTable table, Option schema) { + return condition ? 
deduplicateRecords(records, table, parallelism, schema) : records; } /** @@ -98,10 +96,10 @@ public I combineOnCondition( * @return Collection of HoodieRecord already be deduplicated */ public I deduplicateRecords( - I records, HoodieTable table, int parallelism, boolean precombineAgg, Option schema) { - return deduplicateRecords(records, table.getIndex(), parallelism, precombineAgg, schema); + I records, HoodieTable table, int parallelism, Option schema) { + return deduplicateRecords(records, table.getIndex(), parallelism, schema); } public abstract I deduplicateRecords( - I records, HoodieIndex index, int parallelism, boolean precombineAgg, Option schema); + I records, HoodieIndex index, int parallelism, Option schema); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java index b57d2b0f8f4bf..8cd8de34a272e 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java @@ -46,6 +46,6 @@ public SparkUpsertCommitActionExecutor(HoodieSparkEngineContext context, public HoodieWriteMetadata> execute() { return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - config.shouldCombineAllFieldsBeforeUpsert(), Option.apply(config.getSchema()), this, true); + Option.apply(config.getSchema()), this, true); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index e35515f90f309..be27da3406e60 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -51,7 +51,7 @@ public static SparkWriteHelper newInstance() { @Override public JavaRDD> deduplicateRecords(JavaRDD> records, HoodieIndex>, JavaRDD, JavaRDD> index, - int parallelism, boolean precombineAgg, Option schema) { + int parallelism, Option schema) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); @@ -60,8 +60,8 @@ public JavaRDD> deduplicateRecords(JavaRDD> reco return new Tuple2<>(key, record); }).reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") - T reducedData = precombineAgg && !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(), - new Schema.Parser().parse(schema.get())) : (T) rec1.getData().preCombine(rec2.getData()); + T reducedData = !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) + : (T) rec1.getData().preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. 
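A condensed view of the dedupe step above: records are keyed by record key alone when the index is global (so duplicates collapse across partitions) and by the full HoodieKey otherwise, then each group is reduced with preCombine. One defensive change is assumed beyond the patch: testing schema.isPresent() before schema.get(), since get() on an absent Option fails before the isEmpty() check can run; Option here means org.apache.hudi.common.util.Option.

JavaRDD<HoodieRecord<T>> deduped = records
    .mapToPair(rec -> {
      HoodieKey hoodieKey = rec.getKey();
      // Global index: dedupe on record key only; otherwise on key + partition path.
      Object dedupKey = index.isGlobal() ? hoodieKey.getRecordKey() : hoodieKey;
      return new Tuple2<>(dedupKey, rec);
    })
    .reduceByKey((rec1, rec2) -> {
      @SuppressWarnings("unchecked")
      T reduced = (schema.isPresent() && !schema.get().isEmpty())
          ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get()))
          : (T) rec1.getData().preCombine(rec2.getData());
      // The key and partition path must not change, so reuse them from one input record.
      return new HoodieRecord<>(rec1.getKey(), reduced);
    }, parallelism)
    .values();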
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java index 27deabe4841e2..1edc0e3e3ff9a 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java @@ -46,6 +46,6 @@ public SparkUpsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, public HoodieWriteMetadata execute() { return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - config.shouldCombineAllFieldsBeforeUpsert(),Option.apply(config.getSchema()), this, true); + Option.apply(config.getSchema()), this, true); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java index efd2a68bf71ff..1c15c66410e50 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieJsonPayload.java @@ -54,11 +54,6 @@ public HoodieJsonPayload preCombine(HoodieJsonPayload another) { return this; } - @Override - public HoodieJsonPayload preCombine(HoodieJsonPayload another, Schema schema) throws IOException { - return this; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java index 9639683a80324..783422fc648f2 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/EmptyHoodieRecordPayload.java @@ -24,8 +24,6 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; -import java.io.IOException; - /** * Empty payload used for deletions. 
*/ @@ -42,11 +40,6 @@ public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another) { return another; } - @Override - public EmptyHoodieRecordPayload preCombine(EmptyHoodieRecordPayload another, Schema schema) throws IOException { - return another; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) { return Option.empty(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java index ed454c821d689..a3ab2b71ae980 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieAvroPayload.java @@ -50,11 +50,6 @@ public HoodieAvroPayload preCombine(HoodieAvroPayload another) { return this; } - @Override - public HoodieAvroPayload preCombine(HoodieAvroPayload another, Schema schema) throws IOException { - return this; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java index fe529d61cfcbf..c8fe1d8a2872d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/HoodieRecordPayload.java @@ -41,24 +41,18 @@ public interface HoodieRecordPayload extends Seri * When more than one HoodieRecord have the same HoodieKey, this function combines them before attempting to * insert/upsert (if combining turned on in HoodieClientConfig). */ - @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) + @Deprecated T preCombine(T another); /** * When more than one HoodieRecord have the same HoodieKey, this function combines all fields(which is not null) * before attempting to insert/upsert (if combining turned on in HoodieClientConfig). - * eg: 1) - * Before: - * id name age ts - * 1 Karl null 0.0 - * 1 null 18 0.0 - * After: - * id name age ts - * 1 Karl 18 0.0 * */ @PublicAPIMethod(maturity = ApiMaturityLevel.STABLE) - T preCombine(T another, Schema schema) throws IOException; + default T preCombine(T another, Schema schema) throws IOException { + return preCombine(another); + } /** * This methods lets you write custom merging/combining logic to produce new values as a function of current value on diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java new file mode 100644 index 0000000000000..a13cdd5437e49 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java @@ -0,0 +1,66 @@ +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.util.Option; + +import java.io.IOException; +import java.util.List; + +/** + * subclass of OverwriteWithLatestAvroPayload used for delta streamer. + * + *
+ * <ol>
+ * <li>preCombine - When more than one HoodieRecord have the same HoodieKey, this function combines all fields that are not null + * before attempting to insert/upsert. + * e.g.: + * Before: + * id name age ts + * 1 Karl null 0.0 + * 1 null 18 0.0 + * After: + * id name age ts + * 1 Karl 18 0.0 + * </ol>
+ */ +public class OverwritePrecombineAvroPayload extends OverwriteWithLatestAvroPayload { + public OverwritePrecombineAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public OverwritePrecombineAvroPayload(Option record) { + super(record); + } + + @Override + public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another, Schema schema) throws IOException { + // pick the payload with greatest ordering value and aggregate all the fields,choosing the + // value that is not null + GenericRecord thisValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); + GenericRecord anotherValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(another.recordBytes, schema); + List fields = schema.getFields(); + + if (another.orderingVal.compareTo(orderingVal) > 0) { + GenericRecord anotherRoc = combineAllFields(fields, anotherValue, thisValue); + another.recordBytes = HoodieAvroUtils.avroToBytes(anotherRoc); + return another; + } else { + GenericRecord thisRoc = combineAllFields(fields, thisValue, anotherValue); + this.recordBytes = HoodieAvroUtils.avroToBytes(thisRoc); + return this; + } + } + + public GenericRecord combineAllFields(List fields, GenericRecord priorRec, GenericRecord secPriorRoc) { + for (int i = 0; i < fields.size(); i++) { + Object priorValue = priorRec.get(fields.get(i).name()); + Object secPriorValue = secPriorRoc.get(fields.get(i).name()); + Object defaultVal = fields.get(i).defaultVal(); + if (overwriteField(priorValue, defaultVal) && !overwriteField(secPriorValue, defaultVal)) { + priorRec.put(fields.get(i).name(), secPriorValue); + } + } + return priorRec; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java index bab7223f7fd09..e1e61244bd01a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwriteWithLatestAvroPayload.java @@ -26,7 +26,6 @@ import org.apache.avro.generic.IndexedRecord; import java.io.IOException; -import java.util.List; /** * Default payload used for delta streamer. @@ -47,26 +46,6 @@ public OverwriteWithLatestAvroPayload(Option record) { this(record.isPresent() ? 
record.get() : null, 0); // natural order } - @Override - public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another,Schema schema) throws IOException { - // pick the payload with greatest ordering value and aggregate all the fields,choosing the - // value that is not null - GenericRecord thisValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); - GenericRecord anotherValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(another.recordBytes,schema); - List fields = schema.getFields(); - - if (another.orderingVal.compareTo(orderingVal) > 0) { - GenericRecord anotherRoc = combineAllFields(fields,anotherValue,thisValue); - another.recordBytes = HoodieAvroUtils.avroToBytes(anotherRoc); - return another; - } else { - GenericRecord thisRoc = combineAllFields(fields,thisValue,anotherValue); - this.recordBytes = HoodieAvroUtils.avroToBytes(thisRoc); - return this; - } - - } - @Override public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another) { // pick the payload with greatest ordering value @@ -77,18 +56,6 @@ public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload } } - public GenericRecord combineAllFields(List fields,GenericRecord priorRec,GenericRecord inferiorRoc) { - for (int i = 0; i < fields.size(); i++) { - Object priorValue = priorRec.get(fields.get(i).name()); - Object inferiorValue = inferiorRoc.get(fields.get(i).name()); - Object defaultVal = fields.get(i).defaultVal(); - if (overwriteField(priorValue,defaultVal) && !overwriteField(inferiorValue,defaultVal)) { - priorRec.put(fields.get(i).name(), inferiorValue); - } - } - return priorRec; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java new file mode 100644 index 0000000000000..c38e137c547bd --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java @@ -0,0 +1,84 @@ +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.common.util.Option; + +import java.io.IOException; +import java.util.List; + +/** + * subclass of OverwriteWithLatestAvroPayload used for delta streamer. + * + *
+ * 1. Combines the preCombine behavior of OverwritePrecombineAvroPayload with the combineAndGetUpdateValue behavior of OverwriteNonDefaultsWithLatestAvroPayload.
+ *
+ */ +public class PartialAvroPayload extends OverwriteWithLatestAvroPayload { + public PartialAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public PartialAvroPayload(Option record) { + super(record); + } + + @Override + public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another, Schema schema) throws IOException { + // pick the payload with greatest ordering value and aggregate all the fields,choosing the + // value that is not null + GenericRecord thisValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); + GenericRecord anotherValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(another.recordBytes, schema); + List fields = schema.getFields(); + + if (another.orderingVal.compareTo(orderingVal) > 0) { + GenericRecord anotherRoc = combineAllFields(fields, anotherValue, thisValue); + another.recordBytes = HoodieAvroUtils.avroToBytes(anotherRoc); + return another; + } else { + GenericRecord thisRoc = combineAllFields(fields, thisValue, anotherValue); + this.recordBytes = HoodieAvroUtils.avroToBytes(thisRoc); + return this; + } + } + + public GenericRecord combineAllFields(List fields, GenericRecord priorRec, GenericRecord secPriorRoc) { + for (int i = 0; i < fields.size(); i++) { + Object priorValue = priorRec.get(fields.get(i).name()); + Object secPriorValue = secPriorRoc.get(fields.get(i).name()); + Object defaultVal = fields.get(i).defaultVal(); + if (overwriteField(priorValue, defaultVal) && !overwriteField(secPriorValue, defaultVal)) { + priorRec.put(fields.get(i).name(), secPriorValue); + } + } + return priorRec; + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { + + Option recordOption = getInsertValue(schema); + if (!recordOption.isPresent()) { + return Option.empty(); + } + + GenericRecord insertRecord = (GenericRecord) recordOption.get(); + GenericRecord currentRecord = (GenericRecord) currentValue; + + if (isDeleteRecord(insertRecord)) { + return Option.empty(); + } else { + List fields = schema.getFields(); + fields.forEach(field -> { + Object value = insertRecord.get(field.name()); + Object defaultValue = field.defaultVal(); + if (!overwriteField(value, defaultValue)) { + currentRecord.put(field.name(), value); + } + }); + return Option.of(currentRecord); + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java new file mode 100644 index 0000000000000..4bd22b567915d --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java @@ -0,0 +1,66 @@ +package org.apache.hudi.common.model; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hudi.avro.HoodieAvroUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Unit tests {@link OverwritePrecombineAvroPayload}. 
+ */ +public class TestOverwritePrecombineAvroPayload { + + private Schema schema; + + @BeforeEach + public void setUp() throws Exception { + schema = Schema.createRecord(Arrays.asList( + new Schema.Field("id", Schema.create(Schema.Type.STRING), "", null), + new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", ""), + new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null), + new Schema.Field("_hoodie_is_deleted", Schema.create(Schema.Type.BOOLEAN), "", false) + )); + } + + @Test + public void testActiveRecords() throws IOException { + GenericRecord record1 = new GenericData.Record(schema); + record1.put("id", "1"); + record1.put("partition", "partition0"); + record1.put("ts", 0L); + record1.put("_hoodie_is_deleted", false); + + GenericRecord record2 = new GenericData.Record(schema); + record2.put("id", "2"); + record2.put("partition", ""); + record2.put("ts", 1L); + record2.put("_hoodie_is_deleted", false); + + GenericRecord record3 = new GenericData.Record(schema); + record3.put("id", "2"); + record3.put("partition", "partition0"); + record3.put("ts", 1L); + record3.put("_hoodie_is_deleted", false); + + OverwriteWithLatestAvroPayload payload1 = new OverwritePrecombineAvroPayload(record1, 1); + OverwriteWithLatestAvroPayload payload2 = new OverwritePrecombineAvroPayload(record2, 2); + assertEquals(payload1.preCombine(payload2), payload2); + assertEquals(payload2.preCombine(payload1), payload2); + + assertEquals(record1, payload1.getInsertValue(schema).get()); + assertEquals(record2, payload2.getInsertValue(schema).get()); + + assertEquals(payload1.combineAndGetUpdateValue(record2, schema).get(), record1); + assertEquals(payload2.combineAndGetUpdateValue(record1, schema).get(), record2); + + assertEquals(HoodieAvroUtils.bytesToAvro(payload1.preCombine(payload2, schema).recordBytes, schema), + record3); + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java index 204275c419601..7c5951a7cac04 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwriteWithLatestAvroPayload.java @@ -22,7 +22,6 @@ import org.apache.avro.Schema.Type; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; -import org.apache.hudi.avro.HoodieAvroUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -43,7 +42,7 @@ public class TestOverwriteWithLatestAvroPayload { public void setUp() throws Exception { schema = Schema.createRecord(Arrays.asList( new Schema.Field("id", Schema.create(Schema.Type.STRING), "", null), - new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", ""), + new Schema.Field("partition", Schema.create(Schema.Type.STRING), "", null), new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null), new Schema.Field("_hoodie_is_deleted", Schema.create(Type.BOOLEAN), "", false) )); @@ -59,16 +58,10 @@ public void testActiveRecords() throws IOException { GenericRecord record2 = new GenericData.Record(schema); record2.put("id", "2"); - record2.put("partition", ""); + record2.put("partition", "partition1"); record2.put("ts", 1L); record2.put("_hoodie_is_deleted", false); - GenericRecord record3 = new GenericData.Record(schema); - record3.put("id", "2"); - record3.put("partition", "partition0"); - record3.put("ts", 1L); - 
record3.put("_hoodie_is_deleted", false); - OverwriteWithLatestAvroPayload payload1 = new OverwriteWithLatestAvroPayload(record1, 1); OverwriteWithLatestAvroPayload payload2 = new OverwriteWithLatestAvroPayload(record2, 2); assertEquals(payload1.preCombine(payload2), payload2); @@ -79,9 +72,6 @@ public void testActiveRecords() throws IOException { assertEquals(payload1.combineAndGetUpdateValue(record2, schema).get(), record1); assertEquals(payload2.combineAndGetUpdateValue(record1, schema).get(), record2); - - assertEquals(HoodieAvroUtils.bytesToAvro(payload1.preCombine(payload2, schema).recordBytes,schema), - record3); } @Test diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java index a45f196017a14..ff862ee7b7f7f 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/AvroBinaryTestPayload.java @@ -48,11 +48,6 @@ public HoodieRecordPayload preCombine(HoodieRecordPayload another) { return this; } - @Override - public HoodieRecordPayload preCombine(HoodieRecordPayload another, Schema schema) throws IOException { - return this; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { return getInsertValue(schema); diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java index d44c2d56fa634..8442aff084a49 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/RawTripTestPayload.java @@ -103,11 +103,6 @@ public RawTripTestPayload preCombine(RawTripTestPayload another) { return another; } - @Override - public RawTripTestPayload preCombine(RawTripTestPayload another, Schema schema) throws IOException { - return another; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { return this.getInsertValue(schema); From 2e3e799fe80983388a480195686aae370a7aa7ba Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Tue, 13 Oct 2020 18:01:49 +0800 Subject: [PATCH 13/24] update --- .../hudi/table/action/commit/SparkBulkInsertHelper.java | 2 +- .../hudi/client/TestHoodieClientOnCopyOnWriteStorage.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index 375ced9c3427d..e0d60cd2e8a2c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -73,7 +73,7 @@ public HoodieWriteMetadata> bulkInsert(JavaRDD>) SparkWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, - config.getBulkInsertShuffleParallelism(), table, false, null); + config.getBulkInsertShuffleParallelism(), table, null); } final JavaRDD> repartitionedRecords; diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java 
b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java index 404c0fb6b9282..b599ee83da72e 100644 --- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java @@ -240,14 +240,14 @@ private void testDeduplication( HoodieIndex index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(true); List> dedupedRecs = SparkWriteHelper.newInstance().deduplicateRecords(records, index, 1, - false, null).collect(); + null).collect(); assertEquals(1, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); // non-Global dedup should be done based on both recordKey and partitionPath index = mock(HoodieIndex.class); when(index.isGlobal()).thenReturn(false); - dedupedRecs = SparkWriteHelper.newInstance().deduplicateRecords(records, index, 1, false, null).collect(); + dedupedRecs = SparkWriteHelper.newInstance().deduplicateRecords(records, index, 1, null).collect(); assertEquals(2, dedupedRecs.size()); assertNodupesWithinPartition(dedupedRecs); From a6464f6e9a45f3137af6afad6b363b23c4d01452 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Tue, 13 Oct 2020 21:11:56 +0800 Subject: [PATCH 14/24] update --- .../model/OverwritePrecombineAvroPayload.java | 18 ++++++++++++++++++ .../hudi/common/model/PartialAvroPayload.java | 18 ++++++++++++++++++ .../TestOverwritePrecombineAvroPayload.java | 18 ++++++++++++++++++ 3 files changed, 54 insertions(+) diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java index a13cdd5437e49..9211fea627f52 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hudi.common.model; import org.apache.avro.Schema; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java index c38e137c547bd..8a98d3edca93a 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hudi.common.model; import org.apache.avro.Schema; diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java index 4bd22b567915d..f182350074759 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.hudi.common.model; import org.apache.avro.Schema; From 02a2dfc5bbf106ade2be415e900cabc7dc025785 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Wed, 14 Oct 2020 10:25:10 +0800 Subject: [PATCH 15/24] update --- .../org/apache/hudi/table/action/commit/SparkWriteHelper.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index be27da3406e60..a49dd7696bf2d 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -60,8 +60,8 @@ public JavaRDD> deduplicateRecords(JavaRDD> reco return new Tuple2<>(key, record); }).reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") - T reducedData = !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) - : (T) rec1.getData().preCombine(rec2.getData()); + T reducedData = schema != null && !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) + : (T) rec1.getData().preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. 
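For illustration, a minimal sketch (not part of any patch) of what the schema-aware preCombine above does, using the OverwritePrecombineAvroPayload added earlier in this series; the schema and record values here are hypothetical:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.OverwritePrecombineAvroPayload;
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;

public class PreCombineSketch {
  public static void main(String[] args) throws Exception {
    // Nullable name/age so each record can be missing a field the other has.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"r\",\"fields\":["
            + "{\"name\":\"id\",\"type\":\"string\"},"
            + "{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null},"
            + "{\"name\":\"age\",\"type\":[\"null\",\"int\"],\"default\":null},"
            + "{\"name\":\"ts\",\"type\":\"double\"}]}");

    GenericRecord older = new GenericData.Record(schema);
    older.put("id", "1");
    older.put("name", "Karl");   // age stays null
    older.put("ts", 0.0);

    GenericRecord newer = new GenericData.Record(schema);
    newer.put("id", "1");
    newer.put("age", 18);        // name stays null
    newer.put("ts", 0.0);

    OverwriteWithLatestAvroPayload p1 = new OverwritePrecombineAvroPayload(older, 1);
    OverwriteWithLatestAvroPayload p2 = new OverwritePrecombineAvroPayload(newer, 2);

    // p2 has the greater ordering value, so it wins, but its null fields are
    // back-filled from p1: the merged record is {id=1, name=Karl, age=18, ts=0.0}.
    OverwriteWithLatestAvroPayload merged = p1.preCombine(p2, schema);
    System.out.println(merged.getInsertValue(schema).get());
  }
}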
From f1424204c90bd8416aa3168497b7eb3e63c76075 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Thu, 22 Oct 2020 17:59:09 +0800 Subject: [PATCH 16/24] update --- .../action/commit/AbstractWriteHelper.java | 4 +- .../action/commit/SparkBulkInsertHelper.java | 2 +- .../SparkUpsertCommitActionExecutor.java | 4 +- .../table/action/commit/SparkWriteHelper.java | 68 ++++++++++--------- .../SparkUpsertDeltaCommitActionExecutor.java | 4 +- .../hudi/common/model/PartialAvroPayload.java | 43 ++---------- ....java => UpdatePrecombineAvroPayload.java} | 12 ++-- .../TestOverwritePrecombineAvroPayload.java | 6 +- 8 files changed, 58 insertions(+), 85 deletions(-) rename hudi-common/src/main/java/org/apache/hudi/common/model/{OverwritePrecombineAvroPayload.java => UpdatePrecombineAvroPayload.java} (83%) diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java index 1b97804fcb8e2..3d3b8dbdaebeb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java @@ -20,12 +20,12 @@ import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; -import scala.Option; import java.time.Duration; import java.time.Instant; @@ -41,7 +41,7 @@ public HoodieWriteMetadata write(String instantTime, BaseCommitActionExecutor executor, boolean performTagging) { return write(instantTime, inputRecords, context, table, shouldCombine, shuffleParallelism, - null, executor, performTagging); + Option.empty(), executor, performTagging); } public HoodieWriteMetadata write(String instantTime, diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index e0d60cd2e8a2c..f84f024c3c6a8 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -73,7 +73,7 @@ public HoodieWriteMetadata> bulkInsert(JavaRDD>) SparkWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, - config.getBulkInsertShuffleParallelism(), table, null); + config.getBulkInsertShuffleParallelism(), table, Option.empty()); } final JavaRDD> repartitionedRecords; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java index 8cd8de34a272e..cd6012e7cbaac 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java @@ -23,12 +23,12 @@ import org.apache.hudi.common.model.HoodieRecord; import 
org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.spark.api.java.JavaRDD; -import scala.Option; public class SparkUpsertCommitActionExecutor> extends BaseSparkCommitActionExecutor { @@ -46,6 +46,6 @@ public SparkUpsertCommitActionExecutor(HoodieSparkEngineContext context, public HoodieWriteMetadata> execute() { return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - Option.apply(config.getSchema()), this, true); + Option.of(config.getSchema()), this, true); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index a49dd7696bf2d..b77c7d91ce34f 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -23,11 +23,12 @@ import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; +import org.apache.hudi.common.model.UpdatePrecombineAvroPayload; +import org.apache.hudi.common.util.Option; import org.apache.hudi.index.HoodieIndex; import org.apache.spark.api.java.JavaRDD; -import scala.Option; import scala.Tuple2; /** @@ -35,38 +36,43 @@ * * @param */ -public class SparkWriteHelper extends AbstractWriteHelper>, - JavaRDD, JavaRDD, R> { - private SparkWriteHelper() { - } +public class SparkWriteHelper extends AbstractWriteHelper>, + JavaRDD, JavaRDD, R> { + private SparkWriteHelper() { + } - private static class WriteHelperHolder { - private static final SparkWriteHelper SPARK_WRITE_HELPER = new SparkWriteHelper(); - } + private static class WriteHelperHolder { + private static final SparkWriteHelper SPARK_WRITE_HELPER = new SparkWriteHelper(); + } - public static SparkWriteHelper newInstance() { - return WriteHelperHolder.SPARK_WRITE_HELPER; - } + public static SparkWriteHelper newInstance() { + return WriteHelperHolder.SPARK_WRITE_HELPER; + } - @Override - public JavaRDD> deduplicateRecords(JavaRDD> records, - HoodieIndex>, JavaRDD, JavaRDD> index, - int parallelism, Option schema) { - boolean isIndexingGlobal = index.isGlobal(); - return records.mapToPair(record -> { - HoodieKey hoodieKey = record.getKey(); - // If index used is global, then records are expected to differ in their partitionPath - Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; - return new Tuple2<>(key, record); - }).reduceByKey((rec1, rec2) -> { - @SuppressWarnings("unchecked") - T reducedData = schema != null && !schema.get().isEmpty() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) - : (T) rec1.getData().preCombine(rec2.getData()); - // we cannot allow the user to change the key or partitionPath, since that will affect - // everything - // so pick it from one of the records. 
- return new HoodieRecord(rec1.getKey(), reducedData); - }, parallelism).map(Tuple2::_2); - } + @Override + public JavaRDD> deduplicateRecords(JavaRDD> records, + HoodieIndex>, JavaRDD, JavaRDD> index, + int parallelism, Option schema) { + boolean isIndexingGlobal = index.isGlobal(); + return records.mapToPair(record -> { + HoodieKey hoodieKey = record.getKey(); + // If index used is global, then records are expected to differ in their partitionPath + Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; + return new Tuple2<>(key, record); + }).reduceByKey((rec1, rec2) -> { + @SuppressWarnings("unchecked") + T reducedData; + if (rec2.getData() instanceof UpdatePrecombineAvroPayload) { + reducedData = schema.isPresent() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) + : (T) rec1.getData().preCombine(rec2.getData()); + } else { + reducedData = (T) rec1.getData().preCombine(rec2.getData()); + } + // we cannot allow the user to change the key or partitionPath, since that will affect + // everything + // so pick it from one of the records. + return new HoodieRecord(rec1.getKey(), reducedData); + }, parallelism).map(Tuple2::_2); + } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java index 1edc0e3e3ff9a..9318c8aceae79 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java @@ -22,13 +22,13 @@ import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; +import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.commit.SparkWriteHelper; import org.apache.spark.api.java.JavaRDD; -import scala.Option; public class SparkUpsertDeltaCommitActionExecutor> extends AbstractSparkDeltaCommitActionExecutor { @@ -46,6 +46,6 @@ public SparkUpsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, public HoodieWriteMetadata execute() { return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - Option.apply(config.getSchema()), this, true); + Option.of(config.getSchema()), this, true); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java index 8a98d3edca93a..aefd4a9eb9c89 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/PartialAvroPayload.java @@ -21,20 +21,19 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; -import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.common.util.Option; import java.io.IOException; import java.util.List; /** - * subclass of OverwriteWithLatestAvroPayload used for delta streamer. 
+ * subclass of OverwriteWithLatestAvroPayload.
  *
- *
- *   1. Extract the features of OverwritePrecombineAvroPayload and OverwriteNonDefaultsWithLatestAvroPayload
- *
+ * Extract the function precombine of UpdatePrecombineAvroPayload and combineAndGetUpdateValue of OverwriteNonDefaultsWithLatestAvroPayload. + * Which means When more than one HoodieRecord have the same HoodieKey, this function will combine all fields(which is not null) + * Before attempting to insert/upsert And when insert/upsert into storage. */ -public class PartialAvroPayload extends OverwriteWithLatestAvroPayload { +public class PartialAvroPayload extends UpdatePrecombineAvroPayload { public PartialAvroPayload(GenericRecord record, Comparable orderingVal) { super(record, orderingVal); } @@ -43,40 +42,8 @@ public PartialAvroPayload(Option record) { super(record); } - @Override - public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another, Schema schema) throws IOException { - // pick the payload with greatest ordering value and aggregate all the fields,choosing the - // value that is not null - GenericRecord thisValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); - GenericRecord anotherValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(another.recordBytes, schema); - List fields = schema.getFields(); - - if (another.orderingVal.compareTo(orderingVal) > 0) { - GenericRecord anotherRoc = combineAllFields(fields, anotherValue, thisValue); - another.recordBytes = HoodieAvroUtils.avroToBytes(anotherRoc); - return another; - } else { - GenericRecord thisRoc = combineAllFields(fields, thisValue, anotherValue); - this.recordBytes = HoodieAvroUtils.avroToBytes(thisRoc); - return this; - } - } - - public GenericRecord combineAllFields(List fields, GenericRecord priorRec, GenericRecord secPriorRoc) { - for (int i = 0; i < fields.size(); i++) { - Object priorValue = priorRec.get(fields.get(i).name()); - Object secPriorValue = secPriorRoc.get(fields.get(i).name()); - Object defaultVal = fields.get(i).defaultVal(); - if (overwriteField(priorValue, defaultVal) && !overwriteField(secPriorValue, defaultVal)) { - priorRec.put(fields.get(i).name(), secPriorValue); - } - } - return priorRec; - } - @Override public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { - Option recordOption = getInsertValue(schema); if (!recordOption.isPresent()) { return Option.empty(); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/UpdatePrecombineAvroPayload.java similarity index 83% rename from hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java rename to hudi-common/src/main/java/org/apache/hudi/common/model/UpdatePrecombineAvroPayload.java index 9211fea627f52..b93e046d12714 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/model/OverwritePrecombineAvroPayload.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/UpdatePrecombineAvroPayload.java @@ -27,7 +27,7 @@ import java.util.List; /** - * subclass of OverwriteWithLatestAvroPayload used for delta streamer. + * subclass of OverwriteWithLatestAvroPayload. * *
 *
 *   1. preCombine - When more than one HoodieRecord have the same HoodieKey, this function combines all fields(which is not null)
 * before attempting to insert/upsert.
 * eg: 1)
 * Before:
 * id name age ts
 * 1  Karl null 0.0
 * 1  null 18   0.0
 * After:
 * id name age ts
 * 1  Karl 18   0.0
 *
*/ -public class OverwritePrecombineAvroPayload extends OverwriteWithLatestAvroPayload { - public OverwritePrecombineAvroPayload(GenericRecord record, Comparable orderingVal) { +public class UpdatePrecombineAvroPayload extends OverwriteWithLatestAvroPayload { + public UpdatePrecombineAvroPayload(GenericRecord record, Comparable orderingVal) { super(record, orderingVal); } - public OverwritePrecombineAvroPayload(Option record) { + public UpdatePrecombineAvroPayload(Option record) { super(record); } @@ -55,8 +55,8 @@ public OverwritePrecombineAvroPayload(Option record) { public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload another, Schema schema) throws IOException { // pick the payload with greatest ordering value and aggregate all the fields,choosing the // value that is not null - GenericRecord thisValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); - GenericRecord anotherValue = (GenericRecord) HoodieAvroUtils.bytesToAvro(another.recordBytes, schema); + GenericRecord thisValue = HoodieAvroUtils.bytesToAvro(this.recordBytes, schema); + GenericRecord anotherValue = HoodieAvroUtils.bytesToAvro(another.recordBytes, schema); List fields = schema.getFields(); if (another.orderingVal.compareTo(orderingVal) > 0) { diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java index f182350074759..eaccb4b82fa02 100644 --- a/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/TestOverwritePrecombineAvroPayload.java @@ -31,7 +31,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** - * Unit tests {@link OverwritePrecombineAvroPayload}. + * Unit tests {@link UpdatePrecombineAvroPayload}. 
*/ public class TestOverwritePrecombineAvroPayload { @@ -67,8 +67,8 @@ public void testActiveRecords() throws IOException { record3.put("ts", 1L); record3.put("_hoodie_is_deleted", false); - OverwriteWithLatestAvroPayload payload1 = new OverwritePrecombineAvroPayload(record1, 1); - OverwriteWithLatestAvroPayload payload2 = new OverwritePrecombineAvroPayload(record2, 2); + OverwriteWithLatestAvroPayload payload1 = new UpdatePrecombineAvroPayload(record1, 1); + OverwriteWithLatestAvroPayload payload2 = new UpdatePrecombineAvroPayload(record2, 2); assertEquals(payload1.preCombine(payload2), payload2); assertEquals(payload2.preCombine(payload1), payload2); From 1382b045f04a705d476b6fcb1d6f1600329fc0f3 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Thu, 22 Oct 2020 23:00:16 +0800 Subject: [PATCH 17/24] update --- .../table/action/commit/SparkWriteHelper.java | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index b77c7d91ce34f..3754a144aa97b 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -37,42 +37,42 @@ * @param */ public class SparkWriteHelper extends AbstractWriteHelper>, - JavaRDD, JavaRDD, R> { - private SparkWriteHelper() { - } + JavaRDD, JavaRDD, R> { + private SparkWriteHelper() { + } - private static class WriteHelperHolder { - private static final SparkWriteHelper SPARK_WRITE_HELPER = new SparkWriteHelper(); - } + private static class WriteHelperHolder { + private static final SparkWriteHelper SPARK_WRITE_HELPER = new SparkWriteHelper(); + } - public static SparkWriteHelper newInstance() { - return WriteHelperHolder.SPARK_WRITE_HELPER; - } + public static SparkWriteHelper newInstance() { + return WriteHelperHolder.SPARK_WRITE_HELPER; + } - @Override - public JavaRDD> deduplicateRecords(JavaRDD> records, - HoodieIndex>, JavaRDD, JavaRDD> index, - int parallelism, Option schema) { - boolean isIndexingGlobal = index.isGlobal(); - return records.mapToPair(record -> { - HoodieKey hoodieKey = record.getKey(); - // If index used is global, then records are expected to differ in their partitionPath - Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; - return new Tuple2<>(key, record); - }).reduceByKey((rec1, rec2) -> { - @SuppressWarnings("unchecked") - T reducedData; - if (rec2.getData() instanceof UpdatePrecombineAvroPayload) { - reducedData = schema.isPresent() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) - : (T) rec1.getData().preCombine(rec2.getData()); - } else { - reducedData = (T) rec1.getData().preCombine(rec2.getData()); - } - // we cannot allow the user to change the key or partitionPath, since that will affect - // everything - // so pick it from one of the records. 
- return new HoodieRecord(rec1.getKey(), reducedData); - }, parallelism).map(Tuple2::_2); - } + @Override + public JavaRDD> deduplicateRecords(JavaRDD> records, + HoodieIndex>, JavaRDD, JavaRDD> index, + int parallelism, Option schema) { + boolean isIndexingGlobal = index.isGlobal(); + return records.mapToPair(record -> { + HoodieKey hoodieKey = record.getKey(); + // If index used is global, then records are expected to differ in their partitionPath + Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; + return new Tuple2<>(key, record); + }).reduceByKey((rec1, rec2) -> { + @SuppressWarnings("unchecked") + T reducedData; + if (rec2.getData() instanceof UpdatePrecombineAvroPayload) { + reducedData = schema.isPresent() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) + : (T) rec1.getData().preCombine(rec2.getData()); + } else { + reducedData = (T) rec1.getData().preCombine(rec2.getData()); + } + // we cannot allow the user to change the key or partitionPath, since that will affect + // everything + // so pick it from one of the records. + return new HoodieRecord(rec1.getKey(), reducedData); + }, parallelism).map(Tuple2::_2); + } } From cf1de28c07e9a74249e19c4f0b6f12aaa33a995e Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Fri, 23 Oct 2020 16:59:49 +0800 Subject: [PATCH 18/24] update --- .../org/apache/hudi/table/action/commit/SparkWriteHelper.java | 1 + 1 file changed, 1 insertion(+) diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index 3754a144aa97b..f7312453d759a 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -62,6 +62,7 @@ public JavaRDD> deduplicateRecords(JavaRDD> reco }).reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") T reducedData; + //To prevent every records from parsing schema if (rec2.getData() instanceof UpdatePrecombineAvroPayload) { reducedData = schema.isPresent() ? 
(T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) : (T) rec1.getData().preCombine(rec2.getData()); From a75f293ac43d84901466d5b58bb4558a044a7203 Mon Sep 17 00:00:00 2001 From: Karl-WangSK Date: Wed, 11 Nov 2020 12:16:01 +0800 Subject: [PATCH 19/24] add serializableSchema --- .../client/AbstractHoodieWriteClient.java | 4 +- .../org/apache/hudi/table/HoodieTable.java | 3 +- .../action/commit/AbstractWriteHelper.java | 12 +- .../hudi/client/SparkRDDWriteClient.java | 10 +- .../table/HoodieSparkCopyOnWriteTable.java | 7 +- .../table/HoodieSparkMergeOnReadTable.java | 8 +- .../action/commit/SparkBulkInsertHelper.java | 2 +- .../SparkUpsertCommitActionExecutor.java | 9 +- .../table/action/commit/SparkWriteHelper.java | 5 +- .../SparkUpsertDeltaCommitActionExecutor.java | 9 +- .../commit/TestCopyOnWriteActionExecutor.java | 4 +- .../common/config/SerializableSchema.java | 115 ++++++++++++++++++ .../apache/hudi/avro/TestHoodieAvroUtils.java | 2 +- .../java/org/apache/hudi/DataSourceUtils.java | 6 +- .../apache/hudi/HoodieSparkSqlWriter.scala | 4 +- .../org/apache/hudi/TestDataSourceUtils.java | 6 +- pom.xml | 68 +++++------ 17 files changed, 209 insertions(+), 65 deletions(-) create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/config/SerializableSchema.java diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java index 222e1ab2ca5b2..c18fbe06de7b9 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java @@ -19,6 +19,7 @@ package org.apache.hudi.client; import com.codahale.metrics.Timer; +import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCompactionPlan; @@ -29,6 +30,7 @@ import org.apache.hudi.callback.util.HoodieCommitCallbackFactory; import org.apache.hudi.client.embedded.EmbeddedTimelineService; import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -259,7 +261,7 @@ protected void rollBackInflightBootstrap() { * @param instantTime Instant time of the commit * @return WriteStatus to inspect errors and counts */ - public abstract O upsert(I records, final String instantTime); + public abstract O upsert(I records, final String instantTime, Schema schema); /** * Upserts the given prepared records into the Hoodie table, at the supplied instantTime. 
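A rough caller-side sketch of the new schema-carrying upsert overload (illustrative only, not part of the patch; the helper name is hypothetical and the client, config, and records are assumed to be built elsewhere):

import org.apache.avro.Schema;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaRDD;

class SchemaAwareUpsertSketch {
  static <T extends HoodieRecordPayload> JavaRDD<WriteStatus> upsertWithSchema(
      SparkRDDWriteClient<T> client, HoodieWriteConfig config, JavaRDD<HoodieRecord<T>> records) {
    Schema writeSchema = new Schema.Parser().parse(config.getSchema());
    String instantTime = client.startCommit();
    // New overload introduced by this patch; passing null falls back to preCombine(another).
    return client.upsert(records, instantTime, writeSchema);
  }
}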
diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java index afd1a36649409..a266a959173df 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -31,6 +31,7 @@ import org.apache.hudi.client.common.TaskContextSupplier; import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.common.config.SerializableConfiguration; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.fs.ConsistencyGuard; import org.apache.hudi.common.fs.ConsistencyGuard.FileVisibility; import org.apache.hudi.common.fs.ConsistencyGuardConfig; @@ -123,7 +124,7 @@ private synchronized FileSystemViewManager getViewManager() { * @return HoodieWriteMetadata */ public abstract HoodieWriteMetadata upsert(HoodieEngineContext context, String instantTime, - I records); + I records, Schema schema); /** * Insert a batch of new records into Hoodie table at the supplied instantTime. diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java index 3d3b8dbdaebeb..ae813622d40b4 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/action/commit/AbstractWriteHelper.java @@ -19,8 +19,8 @@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.common.HoodieEngineContext; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieRecordPayload; -import org.apache.hudi.common.util.Option; import org.apache.hudi.exception.HoodieUpsertException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.table.HoodieTable; @@ -41,7 +41,7 @@ public HoodieWriteMetadata write(String instantTime, BaseCommitActionExecutor executor, boolean performTagging) { return write(instantTime, inputRecords, context, table, shouldCombine, shuffleParallelism, - Option.empty(), executor, performTagging); + null, executor, performTagging); } public HoodieWriteMetadata write(String instantTime, @@ -50,7 +50,7 @@ public HoodieWriteMetadata write(String instantTime, HoodieTable table, boolean shouldCombine, int shuffleParallelism, - Option schema, + SerializableSchema schema, BaseCommitActionExecutor executor, boolean performTagging) { try { @@ -84,7 +84,7 @@ private I tag( } public I combineOnCondition( - boolean condition, I records, int parallelism, HoodieTable table, Option schema) { + boolean condition, I records, int parallelism, HoodieTable table, SerializableSchema schema) { return condition ? 
deduplicateRecords(records, table, parallelism, schema) : records; } @@ -96,10 +96,10 @@ public I combineOnCondition( * @return Collection of HoodieRecord already be deduplicated */ public I deduplicateRecords( - I records, HoodieTable table, int parallelism, Option schema) { + I records, HoodieTable table, int parallelism, SerializableSchema schema) { return deduplicateRecords(records, table.getIndex(), parallelism, schema); } public abstract I deduplicateRecords( - I records, HoodieIndex index, int parallelism, Option schema); + I records, HoodieIndex index, int parallelism, SerializableSchema schema); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java index 56f06898abba2..f836b19e7fed5 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/client/SparkRDDWriteClient.java @@ -18,9 +18,11 @@ package org.apache.hudi.client; +import org.apache.avro.Schema; import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; import org.apache.hudi.client.embedded.EmbeddedTimelineService; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieCommitMetadata; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -128,14 +130,18 @@ public void bootstrap(Option> extraMetadata) { getTableAndInitCtx(WriteOperationType.UPSERT, HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS).bootstrap(context, extraMetadata); } - @Override public JavaRDD upsert(JavaRDD> records, String instantTime) { + return upsert(records, instantTime, null); + } + + @Override + public JavaRDD upsert(JavaRDD> records, String instantTime, Schema schema) { HoodieTable>, JavaRDD, JavaRDD> table = getTableAndInitCtx(WriteOperationType.UPSERT, instantTime); table.validateUpsertSchema(); setOperationType(WriteOperationType.UPSERT); this.asyncCleanerService = AsyncCleanerService.startAsyncCleaningIfEnabled(this, instantTime); - HoodieWriteMetadata> result = table.upsert(context, instantTime, records); + HoodieWriteMetadata> result = table.upsert(context, instantTime, records, schema); if (result.getIndexLookupDuration().isPresent()) { metrics.updateIndexMetrics(LOOKUP_STR, result.getIndexLookupDuration().get().toMillis()); } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java index f2b336432b247..83095d01fa667 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkCopyOnWriteTable.java @@ -18,6 +18,7 @@ package org.apache.hudi.table; +import org.apache.avro.Schema; import org.apache.hudi.avro.model.HoodieCleanMetadata; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; @@ -26,6 +27,7 @@ import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieBaseFile; import 
org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; @@ -84,8 +86,9 @@ public HoodieSparkCopyOnWriteTable(HoodieWriteConfig config, HoodieEngineContext } @Override - public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, JavaRDD> records) { - return new SparkUpsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); + public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, + JavaRDD> records, Schema schema) { + return new SparkUpsertCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records, schema).execute(); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java index 0a60dcc50f032..9731db99f0bcf 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/HoodieSparkMergeOnReadTable.java @@ -18,12 +18,14 @@ package org.apache.hudi.table; +import org.apache.avro.Schema; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRestoreMetadata; import org.apache.hudi.avro.model.HoodieRollbackMetadata; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieEngineContext; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -78,8 +80,10 @@ public class HoodieSparkMergeOnReadTable extends } @Override - public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, JavaRDD> records) { - return new SparkUpsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime, records).execute(); + public HoodieWriteMetadata> upsert(HoodieEngineContext context, String instantTime, + JavaRDD> records, Schema schema) { + return new SparkUpsertDeltaCommitActionExecutor<>((HoodieSparkEngineContext) context, config, this, instantTime + , records, schema).execute(); } @Override diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java index f84f024c3c6a8..e0d60cd2e8a2c 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkBulkInsertHelper.java @@ -73,7 +73,7 @@ public HoodieWriteMetadata> bulkInsert(JavaRDD>) SparkWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, - config.getBulkInsertShuffleParallelism(), table, Option.empty()); + config.getBulkInsertShuffleParallelism(), table, null); } final JavaRDD> repartitionedRecords; diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java index cd6012e7cbaac..3078fc542e782 100644 --- 
a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkUpsertCommitActionExecutor.java @@ -18,8 +18,10 @@ package org.apache.hudi.table.action.commit; +import org.apache.avro.Schema; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.common.HoodieSparkEngineContext; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.WriteOperationType; @@ -34,18 +36,21 @@ public class SparkUpsertCommitActionExecutor> extends BaseSparkCommitActionExecutor { private JavaRDD> inputRecordsRDD; + private SerializableSchema schema; public SparkUpsertCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table, - String instantTime, JavaRDD> inputRecordsRDD) { + String instantTime, JavaRDD> inputRecordsRDD, + Schema schema) { super(context, config, table, instantTime, WriteOperationType.UPSERT); this.inputRecordsRDD = inputRecordsRDD; + this.schema = new SerializableSchema(schema); } @Override public HoodieWriteMetadata> execute() { return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table, config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), - Option.of(config.getSchema()), this, true); + schema, this, true); } } diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java index f7312453d759a..d0a685199cab4 100644 --- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java +++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/commit/SparkWriteHelper.java @@ -20,6 +20,7 @@ import org.apache.avro.Schema; import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.SerializableSchema; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; @@ -52,7 +53,7 @@ public static SparkWriteHelper newInstance() { @Override public JavaRDD> deduplicateRecords(JavaRDD> records, HoodieIndex>, JavaRDD, JavaRDD> index, - int parallelism, Option schema) { + int parallelism, SerializableSchema schema) { boolean isIndexingGlobal = index.isGlobal(); return records.mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); @@ -64,7 +65,7 @@ public JavaRDD> deduplicateRecords(JavaRDD> reco T reducedData; //To prevent every records from parsing schema if (rec2.getData() instanceof UpdatePrecombineAvroPayload) { - reducedData = schema.isPresent() ? (T) rec1.getData().preCombine(rec2.getData(), new Schema.Parser().parse(schema.get())) + reducedData = schema.getSchema()!=null ? 
+            ? (T) rec1.getData().preCombine(rec2.getData(), schema.getSchema())
+            : (T) rec1.getData().preCombine(rec2.getData());
       } else {
         reducedData = (T) rec1.getData().preCombine(rec2.getData());
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java
index 9318c8aceae79..b6de6158cd4ff 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/table/action/deltacommit/SparkUpsertDeltaCommitActionExecutor.java
@@ -18,7 +18,9 @@
 package org.apache.hudi.table.action.deltacommit;
 
+import org.apache.avro.Schema;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.SerializableSchema;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.WriteOperationType;
@@ -34,18 +36,21 @@ public class SparkUpsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<T>> extends BaseSparkDeltaCommitActionExecutor<T> {
 
   private JavaRDD<HoodieRecord<T>> inputRecordsRDD;
+  private SerializableSchema schema;
 
   public SparkUpsertDeltaCommitActionExecutor(HoodieSparkEngineContext context, HoodieWriteConfig config, HoodieTable table,
-                                              String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD) {
+                                              String instantTime, JavaRDD<HoodieRecord<T>> inputRecordsRDD,
+                                              Schema schema) {
     super(context, config, table, instantTime, WriteOperationType.UPSERT);
     this.inputRecordsRDD = inputRecordsRDD;
+    this.schema = new SerializableSchema(schema);
   }
 
   @Override
   public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
     return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
         config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),
-        Option.of(config.getSchema()), this, true);
+        schema, this, true);
   }
 }
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java
index 852f8029cccd6..079eaf4a81330 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/action/commit/TestCopyOnWriteActionExecutor.java
@@ -385,7 +385,7 @@ public void testFileSizeUpsertRecords() throws Exception {
     // Insert new records
     BaseSparkCommitActionExecutor actionExecutor = new SparkUpsertCommitActionExecutor(context, config, table,
-        instantTime, jsc.parallelize(records));
+        instantTime, jsc.parallelize(records), null);
     jsc.parallelize(Arrays.asList(1))
         .map(i -> actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator()))
         .map(Transformations::flatten).collect();
@@ -426,7 +426,7 @@ public void testInsertUpsertWithHoodieAvroPayload() throws Exception {
     String partitionPath = writeStatus.getPartitionPath();
     long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
     BaseSparkCommitActionExecutor newActionExecutor = new SparkUpsertCommitActionExecutor(context, config, table,
-        instantTime, jsc.parallelize(updates));
+        instantTime, jsc.parallelize(updates), null);
     final List<WriteStatus> updateStatus = jsc.parallelize(Arrays.asList(1)).map(x -> {
       return newActionExecutor.handleUpdate(partitionPath, fileId, updates.iterator());
     }).map(Transformations::flatten).collect();
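
To make the schema-aware path above concrete, here is a hedged sketch of a payload that overrides the two-argument preCombine this series introduces on the payload interface. The "ts" field name and the latest-timestamp-wins rule are assumptions for illustration; UpdatePrecombineAvroPayload itself is defined outside these hunks:

    import java.io.IOException;

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;

    public class LatestTsAvroPayload extends OverwriteWithLatestAvroPayload {

      public LatestTsAvroPayload(GenericRecord record, Comparable orderingVal) {
        super(record, orderingVal);
      }

      @Override
      public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload oldValue, Schema schema) {
        try {
          // With the schema available, both payloads can be decoded and
          // compared on a concrete field instead of the opaque orderingVal.
          GenericRecord current = (GenericRecord) getInsertValue(schema).get();
          GenericRecord previous = (GenericRecord) oldValue.getInsertValue(schema).get();
          long currentTs = (Long) current.get("ts");
          long previousTs = (Long) previous.get("ts");
          return currentTs >= previousTs ? this : oldValue;
        } catch (IOException e) {
          // Fall back to keeping the current payload if decoding fails.
          return this;
        }
      }
    }
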
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/SerializableSchema.java b/hudi-common/src/main/java/org/apache/hudi/common/config/SerializableSchema.java
new file mode 100644
index 0000000000000..c03ce2e4647c5
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/config/SerializableSchema.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.config;
+
+import org.apache.avro.AvroRuntimeException;
+import org.apache.avro.Schema;
+import org.apache.hadoop.io.WritableUtils;
+
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A wrapper that makes an Avro {@link Schema} serializable.
+ */
+public class SerializableSchema implements Serializable {
+  private static final long serialVersionUID = -3281148111709753816L;
+  private transient Schema schema;
+
+  public SerializableSchema(Schema schema) {
+    this.schema = schema;
+  }
+
+  private void writeObject(ObjectOutputStream out) throws IOException {
+    out.defaultWriteObject();
+    // Write a single length prefix so that writeObject stays symmetric with
+    // readObject below; 0 encodes a null schema.
+    if (schema == null) {
+      WritableUtils.writeVInt(out, 0);
+      return;
+    }
+    List<Schema.Field> fields = schema.getFields();
+    WritableUtils.writeVInt(out, fields.size());
+    for (Schema.Field field : fields) {
+      org.apache.hadoop.io.Text.writeString(out, field.name());
+      org.apache.hadoop.io.Text.writeString(out, castSchemaType(field.schema()));
+      org.apache.hadoop.io.Text.writeString(out, field.doc() == null ? "" : field.doc());
+      out.writeObject(field.defaultVal());
+    }
+  }
+
+  private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+    in.defaultReadObject();
+    int size = WritableUtils.readVInt(in);
+    if (size != 0) {
+      List<Schema.Field> fields = new ArrayList<>();
+      for (int i = 0; i < size; ++i) {
+        String name = org.apache.hadoop.io.Text.readString(in);
+        Schema value = castSchemaType(org.apache.hadoop.io.Text.readString(in));
+        String doc = org.apache.hadoop.io.Text.readString(in);
+        Object defaultValue = in.readObject();
+        fields.add(new Schema.Field(name, value, doc, defaultValue));
+      }
+      this.schema = Schema.createRecord(fields);
+    } else {
+      schema = null;
+    }
+  }
+
+  private String castSchemaType(Schema type) {
+    return type.getType().getName();
+  }
+
+  private Schema castSchemaType(String type) {
+    switch (type) {
+      case "string":
+        return Schema.create(Schema.Type.STRING);
+      case "bytes":
+        return Schema.create(Schema.Type.BYTES);
+      case "int":
+        return Schema.create(Schema.Type.INT);
+      case "long":
+        return Schema.create(Schema.Type.LONG);
+      case "float":
+        return Schema.create(Schema.Type.FLOAT);
+      case "double":
+        return Schema.create(Schema.Type.DOUBLE);
+      case "boolean":
+        return Schema.create(Schema.Type.BOOLEAN);
+      case "null":
+        return Schema.create(Schema.Type.NULL);
+      default:
+        throw new AvroRuntimeException("Can't create a: " + type);
+    }
+  }
+
+  @Override
+  public String toString() {
+    return schema == null ? "null" : schema.toString();
+  }
+
+  public Schema getSchema() {
+    return schema;
+  }
+
+  public void setSchema(Schema schema) {
+    this.schema = schema;
+  }
+}
diff --git a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java
index 7d5cf0408b1ee..19763a7211ae4 100644
--- a/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java
+++ b/hudi-common/src/test/java/org/apache/hudi/avro/TestHoodieAvroUtils.java
@@ -182,7 +182,7 @@ public void testJsonNodeNullWithDefaultValues() {
     Schema.Field evolvedField1 = new Schema.Field("key", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE);
     Schema.Field evolvedField2 = new Schema.Field("key1", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE);
     Schema.Field evolvedField3 = new Schema.Field("key2", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE);
-    Schema.Field evolvedField4 = new Schema.Field("evolved_field", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", NullNode.getInstance());
+    Schema.Field evolvedField4 = new Schema.Field("evolved_field", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", null);
     Schema.Field evolvedField5 = new Schema.Field("evolved_field1", HoodieAvroUtils.METADATA_FIELD_SCHEMA, "", JsonProperties.NULL_VALUE);
     evolvedFields.add(evolvedField1);
     evolvedFields.add(evolvedField2);
diff --git a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
index 0b16c3149b6eb..5b6318aa0b34f 100644
--- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
+++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
@@ -18,6 +18,7 @@
 package org.apache.hudi;
 
+import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -25,6 +26,7 @@
 import org.apache.hudi.client.HoodieWriteResult;
 import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
+import org.apache.hudi.common.config.SerializableSchema;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
@@ -199,7 +201,7 @@ public static String getCommitActionType(WriteOperationType operation, HoodieTableType tableType) {
   }
 
   public static HoodieWriteResult doWriteOperation(SparkRDDWriteClient client, JavaRDD<HoodieRecord> hoodieRecords,
-      String instantTime, WriteOperationType operation) throws HoodieException {
+      String instantTime, WriteOperationType operation, Schema schema) throws HoodieException {
     switch (operation) {
       case BULK_INSERT:
         Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner =
@@ -208,7 +210,7 @@ public static HoodieWriteResult doWriteOperation(SparkRDDWriteClient client, JavaRDD<HoodieRecord> hoodieRecords,
       case INSERT:
         return new HoodieWriteResult(client.insert(hoodieRecords, instantTime));
       case UPSERT:
-        return new HoodieWriteResult(client.upsert(hoodieRecords, instantTime));
+        return new HoodieWriteResult(client.upsert(hoodieRecords, instantTime, schema));
       case INSERT_OVERWRITE:
         return client.insertOverwrite(hoodieRecords, instantTime);
       default:
diff --git a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
index b9731df8b76d7..e2b1c3df4d8c6 100644
--- a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
+++ b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
@@ -29,7 +29,7 @@ import org.apache.hudi.DataSourceWriteOptions._
 import org.apache.hudi.avro.HoodieAvroUtils
 import org.apache.hudi.client.{SparkRDDWriteClient, HoodieWriteResult}
 import org.apache.hudi.client.{SparkRDDWriteClient, WriteStatus}
-import org.apache.hudi.common.config.TypedProperties
+import org.apache.hudi.common.config.{SerializableSchema, TypedProperties}
 import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType, WriteOperationType}
 import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
@@ -176,7 +176,7 @@ private[hudi] object HoodieSparkSqlWriter {
         (true, common.util.Option.empty())
       }
       client.startCommitWithTime(instantTime, commitActionType)
-      val writeResult = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation)
+      val writeResult = DataSourceUtils.doWriteOperation(client, hoodieRecords, instantTime, operation, schema)
       (writeResult, client)
     } else {
       val structName = s"${tblName}_record"
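
The SerializableSchema wrapper imported above can be sanity-checked with a plain Java serialization round trip. A hedged sketch; note that, given castSchemaType, only a flat record of primitive-typed fields survives the trip, and the record name is not preserved:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.ObjectInputStream;
    import java.io.ObjectOutputStream;

    import org.apache.avro.Schema;
    import org.apache.avro.SchemaBuilder;
    import org.apache.hudi.common.config.SerializableSchema;

    public class SerializableSchemaRoundTrip {
      public static void main(String[] args) throws Exception {
        // A flat record of primitive fields, the shape the wrapper supports.
        Schema schema = SchemaBuilder.record("example").fields()
            .requiredString("name")
            .requiredLong("ts")
            .endRecord();

        // Serialize, then deserialize, exactly as Spark would when shipping
        // the wrapper to executors.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
          oos.writeObject(new SerializableSchema(schema));
        }
        try (ObjectInputStream ois =
            new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray()))) {
          SerializableSchema copy = (SerializableSchema) ois.readObject();
          // The field names and primitive types come back; the record name does not.
          System.out.println(copy.getSchema().getFields());
        }
      }
    }
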
diff --git a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java
index 97948b9ee3176..3c2db7ea86caa 100644
--- a/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java
+++ b/hudi-spark/src/test/java/org/apache/hudi/TestDataSourceUtils.java
@@ -131,7 +131,7 @@ public void testDoWriteOperationWithoutUserDefinedBulkInsertPartitioner() throws HoodieException {
     when(hoodieWriteClient.getConfig()).thenReturn(config);
     DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time",
-        WriteOperationType.BULK_INSERT);
+        WriteOperationType.BULK_INSERT, null);
 
     verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(),
         optionCaptor.capture());
@@ -144,7 +144,7 @@ public void testDoWriteOperationWithNonExistUserDefinedBulkInsertPartitioner() throws HoodieException {
     Exception exception = assertThrows(HoodieException.class, () -> {
       DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time",
-          WriteOperationType.BULK_INSERT);
+          WriteOperationType.BULK_INSERT, null);
     });
 
     assertThat(exception.getMessage(), containsString("Could not create UserDefinedBulkInsertPartitioner"));
@@ -155,7 +155,7 @@ public void testDoWriteOperationWithUserDefinedBulkInsertPartitioner() throws HoodieException {
     setAndVerifyHoodieWriteClientWith(NoOpBulkInsertPartitioner.class.getName());
     DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time",
-        WriteOperationType.BULK_INSERT);
+        WriteOperationType.BULK_INSERT, null);
 
     verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(),
         optionCaptor.capture());
diff --git a/pom.xml b/pom.xml
index 2c786cfad81bc..1101809f044a9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -153,40 +153,40 @@
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-checkstyle-plugin</artifactId>
-        <version>3.0.0</version>
-        <dependencies>
-          <dependency>
-            <groupId>com.puppycrawl.tools</groupId>
-            <artifactId>checkstyle</artifactId>
-            <version>8.18</version>
-          </dependency>
-        </dependencies>
-        <configuration>
-          <consoleOutput>true</consoleOutput>
-          <encoding>UTF-8</encoding>
-          <configLocation>style/checkstyle.xml</configLocation>
-          <suppressionsLocation>style/checkstyle-suppressions.xml</suppressionsLocation>
-          <suppressionsFileExpression>checkstyle.suppressions.file</suppressionsFileExpression>
-          <failOnViolation>true</failOnViolation>
-          <violationSeverity>warning</violationSeverity>
-          <includeTestSourceDirectory>true</includeTestSourceDirectory>
-          <sourceDirectories>
-            <sourceDirectory>${project.build.sourceDirectory}</sourceDirectory>
-          </sourceDirectories>
-          <excludes>**\/generated-sources\/</excludes>
-        </configuration>
-        <executions>
-          <execution>
-            <phase>compile</phase>
-            <goals>
-              <goal>check</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-checkstyle-plugin</artifactId>
+        <version>3.0.0</version>
+        <dependencies>
+          <dependency>
+            <groupId>com.puppycrawl.tools</groupId>
+            <artifactId>checkstyle</artifactId>
+            <version>8.18</version>
+          </dependency>
+        </dependencies>
+        <configuration>
+          <consoleOutput>true</consoleOutput>
+          <encoding>UTF-8</encoding>
+          <configLocation>style/checkstyle.xml</configLocation>
+          <suppressionsLocation>style/checkstyle-suppressions.xml</suppressionsLocation>
+          <suppressionsFileExpression>checkstyle.suppressions.file</suppressionsFileExpression>
+          <failOnViolation>true</failOnViolation>
+          <violationSeverity>warning</violationSeverity>
+          <includeTestSourceDirectory>true</includeTestSourceDirectory>
+          <sourceDirectories>
+            <sourceDirectory>${project.build.sourceDirectory}</sourceDirectory>
+          </sourceDirectories>
+          <excludes>**\/generated-sources\/</excludes>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>compile</phase>
+            <goals>
+              <goal>check</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
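
Putting the pieces together, an end-to-end sketch of the datasource write path after this change; apart from DataSourceUtils.doWriteOperation itself, the surrounding method and its inputs are hypothetical stand-ins:

    import org.apache.avro.Schema;
    import org.apache.hudi.DataSourceUtils;
    import org.apache.hudi.client.HoodieWriteResult;
    import org.apache.hudi.client.SparkRDDWriteClient;
    import org.apache.hudi.common.model.HoodieRecord;
    import org.apache.hudi.common.model.WriteOperationType;
    import org.apache.spark.api.java.JavaRDD;

    public class DataSourceWriteExample {
      public static HoodieWriteResult upsertViaDataSource(SparkRDDWriteClient client,
                                                          JavaRDD<HoodieRecord> records,
                                                          String instantTime,
                                                          Schema writerSchema) {
        // Of the branches in the switch above, only UPSERT consumes the
        // schema; the other operations ignore it, which is why the tests
        // pass null for BULK_INSERT.
        return DataSourceUtils.doWriteOperation(client, records, instantTime,
            WriteOperationType.UPSERT, writerSchema);
      }
    }
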