137 commits
d73d7f5
[HUDI-2815] add partial overwrite payload to support partial overwrit…
stayrascal Jan 30, 2022
6b6a60f
[HUDI-2815] add compareTo test case
stayrascal Feb 6, 2022
2fa2d57
Merge branch 'master' into HUDI-2815
stayrascal Feb 6, 2022
21df6fe
[HUDI-2815] fix conflict by changing HoodieRecord to HoodieAvroRecord
stayrascal Feb 7, 2022
940f6de
Merge remote-tracking branch 'origin/master' into HUDI-2815
stayrascal Feb 21, 2022
14edef0
[HUDI-2815] 1. passing the payload schema instead of embedding it in …
stayrascal Feb 21, 2022
ce561bc
[HUDI-2815] add test case for nest type for testing partial update
stayrascal Feb 21, 2022
c6f524e
[HUDI-2815] remove unused configuration and refactor partial update l…
stayrascal Feb 25, 2022
d3b3e05
[HUDI-2815] pass schema during precombine two record in compaction pr…
stayrascal Feb 26, 2022
10e080b
[MINOR] fix get builtin function issue from Hudi catalog
stayrascal Feb 27, 2022
a86b7ff
Merge branch 'master' into HUDI-2815
stayrascal Mar 8, 2022
d11c670
Merge branch 'master' into HUDI-2815
stayrascal Mar 16, 2022
715d4b0
Merge branch 'master' into HUDI-2815
stayrascal Mar 22, 2022
e89fd60
[HUDI-2815] fix the conflict and small refactor
stayrascal Mar 22, 2022
3f771d3
Merge branch 'master' into HUDI-2815
stayrascal Mar 25, 2022
b823e94
[HUDI-3521] Fixing kakfa key and value serializer value type from cla…
nsivabalan Feb 27, 2022
9c15335
[HUDI-3018] Adding validation to dataframe scheme to ensure reserved …
nsivabalan Feb 27, 2022
eef40bc
[MINOR] Change MINI_BATCH_SIZE to 2048 (#4862)
cuibo01 Feb 28, 2022
3a373c2
[HUDI-2917] rollback insert data appended to log file when using Hbas…
nsivabalan Feb 28, 2022
6fbf453
[HUDI-3528] Fix String convert issue and overwrite putAll method in T…
stayrascal Feb 28, 2022
1e236ba
[HUDI-3341] Fix log file reader for S3 with hadoop-aws 2.7.x (#4897)
yihua Feb 28, 2022
ac3e72a
[HUDI-3450] Avoid passing empty string spark master to hudi cli (#4844)
zhedoubushishi Feb 28, 2022
f1a8d0c
[HUDI-3418] Save timeout option for remote RemoteFileSystemView (#4809)
yuzhaojing Feb 28, 2022
3697d8c
[HUDI-3465] Add validation of column stats and bloom filters in Hoodi…
yihua Mar 1, 2022
975c463
[HUDI-3497] Adding Datatable validator tool (#4902)
nsivabalan Mar 1, 2022
46ea95d
[HUDI-3441] Add support for "marker delete" in hudi-cli (#4922)
XuQianJin-Stars Mar 1, 2022
4aaee39
[HUDI-3516] Implement record iterator for HoodieDataBlock (#4909)
cuibo01 Mar 2, 2022
6a13069
[HUDI-2631] In CompactFunction, set up the write schema each time wit…
yuzhaojing Mar 2, 2022
466a633
[HUDI-3469] Refactor `HoodieTestDataGenerator` to provide for reprodu…
Mar 2, 2022
fe4aefd
[HUDI-3315] RFC-35 Part-1 Support bucket index in Flink writer (#4679)
garyli1019 Mar 2, 2022
0b9f295
[minor] Cosmetic changes following HUDI-3315 (#4934)
danny0405 Mar 2, 2022
6731992
[MINOR] Adding more test props to integ tests (#4935)
nsivabalan Mar 2, 2022
4b975fd
[MINOR] RFC-38 markdown content error (#4933)
liujinhui1994 Mar 2, 2022
7a30b08
[HUDI-3264]: made schema registry urls configurable with MTDS (#4779)
pratyakshsharma Mar 2, 2022
d6e38af
[HUDI-2973] RFC-27: Data skipping index to improve query performance …
manojpec Mar 3, 2022
ef9ff1a
[HUDI-3544] Fixing "populate meta fields" update to metadata table (#…
nsivabalan Mar 3, 2022
dd7e772
[HUDI-3552] Strength the NetworkUtils#getHostname by checking network…
danny0405 Mar 3, 2022
8a4cfb7
[HUDI-3548] Fix if user specify key "hoodie.datasource.clustering.asy…
Mar 4, 2022
6af6076
[HUDI-3445] Support Clustering Command Based on Call Procedure Comman…
huberylee Mar 4, 2022
77b0f3f
[HUDI-3161][RFC-47] Add Call Produce Command for Spark SQL (#4607)
XuQianJin-Stars Mar 4, 2022
6bb4181
[MINOR] fix UTC timezone config (#4950)
YuweiXiao Mar 4, 2022
6c4b714
[HUDI-3348] Add UT to verify HoodieRealtimeFileSplit serde (#4951)
xushiyan Mar 4, 2022
b851feb
[HUDI-3460] Add reader merge memory option for flink (#4911)
cuibo01 Mar 4, 2022
4d86424
[HUDI-2761] Fixing timeline server for repeated refreshes (#4812)
nsivabalan Mar 5, 2022
0b21be2
[HUDI-3130] Fixing Hive getSchema for RT tables addressing different …
aditiwari01 Mar 6, 2022
55f5626
[HUDI-3520] Introduce DeleteSupportSchemaPostProcessor to support add…
wangxianghu Mar 6, 2022
d2aed60
[HUDI-3525] Introduce JsonkafkaSourceProcessor to support data prepro…
wangxianghu Mar 6, 2022
4c15551
[HUDI-3069] Improve HoodieMergedLogRecordScanner avoid putting unnece…
scxwhite Mar 7, 2022
b9230e0
[HUDI-3213] Making commit preserve metadata to true for compaction (#…
nsivabalan Mar 7, 2022
1e68d6f
[HUDI-3561] Avoid including whole `MultipleSparkJobExecutionStrategy`…
Mar 7, 2022
f28bad6
[HUDI-3365] Make sure Metadata Table records are updated appropriatel…
Mar 7, 2022
6fa32a0
[HUDI-2747] support set --sparkMaster for MDT cli (#4964)
zhangyue19921010 Mar 7, 2022
da9962b
[HUDI-3576] Configuring timeline refreshes based on latest commit (#4…
nsivabalan Mar 7, 2022
f52553b
[HUDI-3573] flink cleanFuntion execute clean on initialization (#4936)
todd5167 Mar 8, 2022
a5b9f66
[MINOR][HUDI-3460]Fix HoodieDataSourceITCase
cuibo01 Mar 6, 2022
69f058c
[HUDI-2677] Add DFS based message queue for flink writer[part3] (#4961)
danny0405 Mar 8, 2022
2a18375
[HUDI-3574] Improve maven module configs for different spark profiles…
XuQianJin-Stars Mar 8, 2022
8cba0a9
[HUDI-3584] Skip integ test modules by default (#4986)
xushiyan Mar 8, 2022
ced2def
[HUDI-3356][HUDI-3203] HoodieData for metadata index records; BloomFi…
codope Mar 8, 2022
1409c0b
[HUDI-3221] Support querying a table as of a savepoint (#4720)
XuQianJin-Stars Mar 8, 2022
cd47bc9
[HUDI-3587] Making SupportsUpgradeDowngrade serializable (#4991)
nsivabalan Mar 9, 2022
d22d93f
[HUDI-3568] Introduce ChainedSchemaPostProcessor to support setting m…
wangxianghu Mar 9, 2022
d0d6981
[HUDI-3383] Sync column comments while syncing a hive table (#4960)
MrSleeping123 Mar 10, 2022
180b690
[MINOR] Add IT CI Test timeout option (#5003)
XuQianJin-Stars Mar 10, 2022
b4770df
[HUDI-3396] Make sure `BaseFileOnlyViewRelation` only reads projected…
Mar 10, 2022
f76144b
[HUDI-3581] Reorganize some clazz for hudi flink (#4983)
danny0405 Mar 10, 2022
f7886f8
[HUDI-3602][DOCS] Update docker README to build multi-arch images usi…
codope Mar 10, 2022
fc6c7a7
[HUDI-3586] Add Trino Queries in integration tests (#4988)
yihua Mar 11, 2022
7d89404
[HUDI-3595] Fixing NULL schema provider for empty batch (#5002)
nsivabalan Mar 11, 2022
801c69d
[HUDI-3522] Introduce DropColumnSchemaPostProcessor to support drop c…
wangxianghu Mar 11, 2022
cf03735
[HUDI-2999] [RFC-42] RFC for consistent hashing index (#4326)
YuweiXiao Mar 11, 2022
d963079
[HUDI-3566] Add thread factory in BoundedInMemoryExecutor (#4926)
scxwhite Mar 11, 2022
5f59bcb
[HUDI-3575] Use HoodieTestDataGenerator#TRIP_SCHEMA as example schema…
wangxianghu Mar 11, 2022
04baf70
[HUDI-3567] Refactor HoodieCommonUtils to make code more reasonable (…
huberylee Mar 11, 2022
5da95d5
[HUDI-3513] Make sure Column Stats does not fail in case it fails to …
Mar 11, 2022
a51bdb5
[HUDI-3592] Fix NPE of DefaultHoodieRecordPayload if Property is empt…
Mar 11, 2022
9e1cad8
[HUDI-3569] Introduce ChainedJsonKafkaSourePostProcessor to support s…
wangxianghu Mar 11, 2022
5403db3
[HUDI-3556] Re-use rollback instant for rolling back of clustering an…
nsivabalan Mar 11, 2022
151ce1e
[HUDI-3593] Restore TypedProperties and flush checksum in table confi…
codope Mar 13, 2022
ff16cdc
[HUDI-3583] Fix MarkerBasedRollbackStrategy NoSuchElementException (#…
liujinhui1994 Mar 13, 2022
54808ec
[HUDI-3501] Support savepoints command based on Call Produce Command …
XuQianJin-Stars Mar 13, 2022
6530d83
[HUDI-3613] Adding/fixing yamls for metadata (#5029)
nsivabalan Mar 14, 2022
6570198
[HUDI-3600] Tweak the default cleaning strategy to be more streaming …
danny0405 Mar 14, 2022
967b336
fix NPE when run schdule using spark-sql if the commits time < hoodie…
peanut-chenzhong Mar 14, 2022
399eb8d
[MINODR] Remove repeated kafka-clients dependencies (#5034)
wangxianghu Mar 14, 2022
07d6929
[HUDI-3621] Fixing NullPointerException in DeltaStreamer (#5039)
nsivabalan Mar 14, 2022
f9ae271
[HUDI-3623] Removing hive sync node from non hive yamls (#5040)
nsivabalan Mar 14, 2022
31b54c7
[HUDI-3620] Adding spark3.2.0 profile (#5038)
nsivabalan Mar 14, 2022
95ef13c
[HUDI-3547] Introduce MaxwellSourcePostProcessor to extract data from…
wangxianghu Mar 15, 2022
1a7157a
[HUDI-3606] Add `org.objenesis:objenesis` to hudi-timeline-server-bun…
cdmikechen Mar 15, 2022
145440b
[HUDI-3619] Fix HoodieOperation fromValue using wrong constant value …
Mar 15, 2022
c0eecb5
[HUDI-3514] Rebase Data Skipping flow to rely on MT Column Stats inde…
Mar 15, 2022
035c3ca
[HUDI-3633] Allow non-string values to be set in TypedProperties (#5045)
codope Mar 15, 2022
ece2ae6
[HUDI-3589] flink sync hive metadata supports table properties and se…
todd5167 Mar 15, 2022
a55ce33
[HUDI-3588] Remove hudi-common and hudi-hadoop-mr jars in Presto Dock…
yihua Mar 16, 2022
895becc
[HUDI-3607] Support backend switch in HoodieFlinkStreamer (#5032)
liufangqi Mar 16, 2022
00b2e45
[Hudi-3376] Add an option to skip under deletion files for HoodieMeta…
zhangyue19921010 Mar 17, 2022
4512e96
[HUDI-3404] Automatically adjust write configs based on metadata tabl…
yihua Mar 17, 2022
c163ac2
[HUDI-3494] Consider triggering condition of MOR compaction during ar…
yihua Mar 17, 2022
402f60e
[HUDI-3645] Fix NPE caused by multiple threads accessing non-thread-s…
fengjian428 Mar 17, 2022
b825b8a
[HUDI-2439] Replace RDD with HoodieData in HoodieSparkTable and commi…
xushiyan Mar 17, 2022
029622b
[MINOR] HoodieFileScanRDD could print null path (#5056)
Mar 17, 2022
931747d
[HUDI-3598] Row Data to Hoodie Record Operator parallelism needs to a…
JerryYue-M Mar 18, 2022
75abad6
[HUDI-3656] Adding medium sized dataset for clustering and minor fixe…
nsivabalan Mar 18, 2022
9037045
[HUDI-3659] Reducing the validation frequency with integ tests (#5067)
nsivabalan Mar 18, 2022
9c40d0c
[HUDI-3457] Refactored Spark DataSource Relations to avoid code dupli…
Mar 19, 2022
d9ca8e1
[HUDI-3663] Fixing Column Stats index to properly handle first Data T…
Mar 20, 2022
dfc05b7
[MINOR] Remove flaky assert in TestInLineFileSystem (#5069)
yihua Mar 20, 2022
618fe26
[HUDI-3665] Support flink multiple versions (#5072)
danny0405 Mar 21, 2022
b28f5d2
[MINOR] Fixing sparkUpdateNode for record generation (#5079)
nsivabalan Mar 21, 2022
542cec6
[HUDI-3559] Flink bucket index with COW table throws NoSuchElementExc…
wxplovecc Mar 11, 2022
75056ea
[HUDI-1436]: Provide an option to trigger clean every nth commit (#4385)
pratyakshsharma Mar 22, 2022
d1e31f8
[HUDI-3640] Set SimpleKeyGenerator as default in 2to3 table upgrade f…
yihua Mar 22, 2022
e19b5d1
[HUDI-2883] Refactor hive sync tool / config to use reflection and st…
rmahindra123 Mar 22, 2022
b709f75
[HUDI-3642] Handle NPE due to empty requested replacecommit metadata …
codope Mar 23, 2022
1ce9a5e
Fixing non partitioned all files record in MDT (#5108)
nsivabalan Mar 24, 2022
dcbb074
[minor] Checks the data block type for archived timeline (#5106)
danny0405 Mar 24, 2022
0640f20
[HUDI-3689] Fix glob path and hive sync in deltastreamer tests (#5117)
codope Mar 24, 2022
d482527
[HUDI-3684] Fixing NPE in `ParquetUtils` (#5102)
Mar 24, 2022
7f5ee51
[HUDI-3689] Remove Azure CI cache (#5121)
xushiyan Mar 24, 2022
5558b79
[HUDI-3689] Fix UT failures in TestHoodieDeltaStreamer (#5120)
xushiyan Mar 24, 2022
a9b4110
[HUDI-3706] Downgrade maven surefire and failsafe version (#5123)
yihua Mar 24, 2022
ffac31e
[HUDI-3689] Fix delta streamer tests (#5124)
xushiyan Mar 24, 2022
5854243
[HUDI-3689] Disable flaky tests in TestHoodieDeltaStreamer (#5127)
yihua Mar 24, 2022
f8092a3
[HUDI-3624] Check all instants before starting a commit in metadata t…
yihua Mar 25, 2022
32b9700
[HUDI-3638] Make ZookeeperBasedLockProvider serializable (#5112)
yihua Mar 25, 2022
27adaa2
[HUDI-3701] Flink bulk_insert support bucket hash index (#5118)
danny0405 Mar 25, 2022
9c49e43
[HUDI-1180] Upgrade HBase to 2.4.9 (#5004)
yihua Mar 25, 2022
1959d8b
[HUDI-3703] Reset taskID in restoreWriteMetadata (#5122)
yuzhaojing Mar 25, 2022
4568fae
[HUDI-3580] Claim RFC number 48 for LogCompaction action RFC (#5128)
suryaprasanna Mar 25, 2022
c43747e
[HUDI-3678] Fix record rewrite of create handle when 'preserveMetadat…
danny0405 Mar 25, 2022
5b66abf
[HUDI-3594] Supporting Composite Expressions over Data Table Columns …
Mar 25, 2022
06ac8cb
[HUDI-3711] Fix typo in MaxwellJsonKafkaSourcePostProcessor.Config#PR…
wangxianghu Mar 25, 2022
24cc379
[HUDI-3563] Make quickstart examples covered by CI tests (#5082)
XuQianJin-Stars Mar 25, 2022
5a0a1e9
Merge branch 'master' into HUDI-2815
stayrascal Apr 12, 2022
d9da263
Merge branch 'master' into HUDI-2815
stayrascal Apr 12, 2022
20b1ee4
fix conflict
stayrascal Apr 12, 2022
@@ -105,7 +105,7 @@ public List<HoodieRecord<T>> deduplicateRecords(
// we cannot allow the user to change the key or partitionPath, since that will affect
// everything
// so pick it from one of the records.
boolean choosePrev = data1.equals(reducedData);
boolean choosePrev = data2.compareTo(data1) < 0;
HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey();
HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation();
Contributor

Why do we need a compareTo here?

Contributor Author
@stayrascal Feb 8, 2022

The previous logic of data2.preCombine(data1) was to return one of data1 or data2, ordered by their orderingVal. But if we merge/combine data1 and data2 into a new payload (reducedData), then data1.equals(reducedData) is always false. In order to get the HoodieKey and HoodieOperation for the new HoodieRecord holding reducedData, we need to pick the latest HoodieKey and HoodieOperation from data1 and data2. compareTo is used to replace #preCombine for comparing their orderingVal.

 @Override
  public int compareTo(OverwriteWithLatestAvroPayload oldValue) {
    return orderingVal.compareTo(oldValue.orderingVal);
  }
@Test
  public void testCompareFunction() {
    GenericRecord record = new GenericData.Record(schema);
    record.put("id", "1");
    record.put("partition", "partition1");
    record.put("ts", 0L);
    record.put("_hoodie_is_deleted", false);
    record.put("city", "NY0");
    record.put("child", Arrays.asList("A"));

    PartialOverwriteWithLatestAvroPayload payload1 = new PartialOverwriteWithLatestAvroPayload(record, 1);
    PartialOverwriteWithLatestAvroPayload payload2 = new PartialOverwriteWithLatestAvroPayload(record, 2);

    assertEquals(payload1.compareTo(payload2), -1);
    assertEquals(payload2.compareTo(payload1), 1);
    assertEquals(payload1.compareTo(payload1), 0);
  }

Contributor Author

Actually, rec1 and rec2 should have the same HoodieKey here, right? But the HoodieOperation might differ.

HoodieRecord<T> hoodieRecord = new HoodieAvroRecord<>(reducedKey, reducedData, operation);
@@ -78,4 +78,8 @@ public String getFileId() {
public void setFileId(String fileId) {
this.fileId = fileId;
}

public HoodieRecordLocation toLocal(String instantTime) {
return new HoodieRecordLocation(instantTime, fileId);
}
}
@@ -58,6 +58,18 @@ default T preCombine(T oldValue, Properties properties) {
return preCombine(oldValue);
}

/**
* When more than one HoodieRecord in the incoming batch has the same HoodieKey, and preCombine returns a merged result instead of choosing one of the two records,
* this method can be called to determine the ordering between the combined record and the previous records.
* @param oldValue instance of the old {@link HoodieRecordPayload} to compare against.
* @return a negative integer, zero, or a positive integer as this object is less than, equal to, or greater than the specified object.
*
*/
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
default int compareTo(T oldValue) {
return 0;
}

/**
* This methods is deprecated. Please refer to {@link #combineAndGetUpdateValue(IndexedRecord, Schema, Properties)} for java docs.
*/
@@ -40,8 +40,18 @@
public class OverwriteWithLatestAvroPayload extends BaseAvroPayload
implements HoodieRecordPayload<OverwriteWithLatestAvroPayload> {

/**
* The schema of the generic record.
*/
public final String schema;
Contributor

this might be confusing w/ the schema arg of combineAndGetUpdateValue. can you fix either of the names?

Contributor

but in general, storing the schema along w/ the payload might have an impact on performance, and that's why the initial payload was designed that way. So, do add a line here noting that payload implementations setting this schema field might have to watch out for performance.

Contributor Author

removed this field.


public OverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal) {
this(record, orderingVal, null);
}

public OverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal, String schema) {
super(record, orderingVal);
this.schema = schema;
}

public OverwriteWithLatestAvroPayload(Option<GenericRecord> record) {
@@ -0,0 +1,141 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.common.model;

import org.apache.hudi.common.util.Option;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;

import java.io.IOException;
import java.util.List;
import java.util.Objects;

import static org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro;

/**
* The only difference with {@link OverwriteNonDefaultsWithLatestAvroPayload} is that it supports
* merging the latest non-null partial fields with the old record instead of replacing the whole record.
* It also merges the non-null fields when pre-combining multiple records with the same record key, instead of choosing the latest record based on the ordering field.
*
* <p> Regarding #combineAndGetUpdateValue, assuming a {@link GenericRecord} has row schema: (f0 int, f1 int, f2 int).
* The first record value is: (1, 2, 3), the second record value is: (4, 5, null) with the field f2 value as null.
* Calling the #combineAndGetUpdateValue method of the two records returns record: (4, 5, 3).
* Note that field f2 value is ignored because it is null. </p>
*
* <p> Regarding #preCombine, assuming a {@link GenericRecord} has row schema: (f0 int, f1 int, f2 int, o1 int),
* and two {@link PartialOverwriteWithLatestAvroPayload} payloads are initialized with different ordering values.
* The first record value is (1, null, 1, 1) with the field f1 value as null, the second value is: (2, 2, null, 2) with the f2 value as null.
* Calling the #preCombine method of the two records returns record: (2, 2, 1, 2).
* Note:
* <ol>
* <li>the field f0 value is 2 because the ordering value of the second record is greater.</li>
* <li>the field f1 value is 2 because the f1 value of the first record is null.</li>
* <li>the field f2 value is 1 because the f2 value of the second record is null.</li>
* <li>the field o1 value is 2 because the ordering value of the second record is greater.</li>
* </ol>
*
* </p>
*/
public class PartialOverwriteWithLatestAvroPayload extends OverwriteWithLatestAvroPayload {

public PartialOverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal) {
this(record, orderingVal, null);
}

public PartialOverwriteWithLatestAvroPayload(GenericRecord record, Comparable orderingVal, String schema) {
super(record, orderingVal, schema);
}

public PartialOverwriteWithLatestAvroPayload(Option<GenericRecord> record) {
super(record); // natural order
}

@Override
public Option<IndexedRecord> combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException {
if (recordBytes.length == 0) {
return Option.empty();
}

GenericRecord incomingRecord = bytesToAvro(recordBytes, schema);
if (isDeleteRecord(incomingRecord)) {
return Option.empty();
}

GenericRecord currentRecord = (GenericRecord) currentValue;
List<Schema.Field> fields = schema.getFields();
Contributor

guess, this has to be "this.schema.getFields". as I commented earlier, it's confusing :). can we fix the naming of either of them?

Contributor Author

fixed the schema name.

fields.forEach(field -> {
Object value = incomingRecord.get(field.name());
Contributor

do we need to deal w/ nested fields here?

Contributor Author

The current logic will overwrite the whole nested field if the incoming field is not null.

And I think we don't need to support partial update inside nested fields, for example Map, List, etc. We should not merge map(1 -> 'a', 2 -> 'b') & map(1 -> '', 3 -> 'c') into map(1 -> '', 2 -> 'b', 3 -> 'c'), in case the upstream wants to delete the key '2'; if we merged them together, they could never delete some elements. The same applies to List.

if (Objects.nonNull(value)) {
currentRecord.put(field.name(), value);
}
});

return Option.of(currentRecord);
}

@Override
public int compareTo(OverwriteWithLatestAvroPayload oldValue) {
return orderingVal.compareTo(oldValue.orderingVal);
}

@Override
public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload oldValue) {
Contributor

instead of storing the schema with the payload, did you think about adding a new preCombine method as follows?

OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload oldValue, Schema schema);

this would make it a lot simpler, right? Since preCombine is used only to dedup records within a single batch, both records should have the same schema.
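For illustration, a minimal sketch of what this suggested overload could look like as a member of PartialOverwriteWithLatestAvroPayload (an editor's sketch under the reviewer's assumption that both records share one schema; it reuses the class's mergeValue helper and omits delete-record handling):

public OverwriteWithLatestAvroPayload preCombine(OverwriteWithLatestAvroPayload oldValue, Schema writerSchema) {
  try {
    // Decode both payloads with the schema supplied by the caller,
    // instead of a schema stored inside the payload.
    Option<IndexedRecord> incomingOpt = getInsertValue(writerSchema);
    Option<IndexedRecord> oldOpt = oldValue.getInsertValue(writerSchema);
    if (!incomingOpt.isPresent() || !oldOpt.isPresent()) {
      return oldOpt.isPresent() ? oldValue : this;
    }
    GenericRecord incoming = (GenericRecord) incomingOpt.get();
    GenericRecord old = (GenericRecord) oldOpt.get();
    boolean chooseCurrent = this.orderingVal.compareTo(oldValue.orderingVal) > 0;
    // Merge field by field: a non-null value wins, ties are broken by ordering value.
    for (Schema.Field field : writerSchema.getFields()) {
      incoming.put(field.name(), mergeValue(incoming.get(field.name()), old.get(field.name()), chooseCurrent));
    }
    return new PartialOverwriteWithLatestAvroPayload(incoming, chooseCurrent ? this.orderingVal : oldValue.orderingVal);
  } catch (IOException e) {
    // Fall back to the plain ordering-based preCombine on decode failure.
    return super.preCombine(oldValue);
  }
}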

Contributor Author

Hi @nsivabalan, thanks a lot for reviewing this.

Regarding adding a new preCombine method with a Schema: I considered this, but it means the caller needs to get the schema info first, and currently it seems we can only get the schema info from the Configuration (the hoodie.avro.schema field). Sometimes the caller might find it hard to get the schema info, especially for FlinkWriteHelper.deduplicateRecords(List<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism).

But comparing the performance, passing the schema into the method seems to be the better approach.
BTW, since we already have the method preCombine(T oldValue, Properties properties), how about putting the schema string in the properties and parsing it into a Schema later, so that we don't need to create a new method? Otherwise, I cannot imagine when we would ever use Properties.

if (null == this.schema || null == oldValue.schema) {
return super.preCombine(oldValue);
}

try {
Schema schema = new Schema.Parser().parse(this.schema);
Contributor

argh. this again clashes w/ the instance variable "schema". Can we fix the naming?

Contributor Author

solved

Option<IndexedRecord> incomingOption = getInsertValue(new Schema.Parser().parse(this.schema));
Option<IndexedRecord> insertRecordOption = oldValue.getInsertValue(new Schema.Parser().parse(oldValue.schema));
Contributor

insertRecordOption -> oldRecordOption

Contributor Author

solved.


if (incomingOption.isPresent() && insertRecordOption.isPresent()) {
GenericRecord currentRecord = (GenericRecord) incomingOption.get();
GenericRecord insertRecord = (GenericRecord) insertRecordOption.get();
boolean chooseCurrent = this.orderingVal.compareTo(oldValue.orderingVal) > 0;

if (!isDeleteRecord(insertRecord) && !isDeleteRecord(currentRecord)) {
schema.getFields().forEach(field -> {
Object insertValue = insertRecord.get(field.name());
Object currentValue = currentRecord.get(field.name());
currentRecord.put(field.name(), mergeValue(currentValue, insertValue, chooseCurrent));
});
return new PartialOverwriteWithLatestAvroPayload(currentRecord, chooseCurrent ? this.orderingVal : oldValue.orderingVal, this.schema);
} else {
return isDeleteRecord(insertRecord) ? this : oldValue;
}
Contributor

We should be cautious of DELETEs; should we still merge DELETE messages?

Contributor Author

yeah, if one of the records is a DELETE record, we just return one of them directly; there is no need to merge, since the DELETE message deletes the old record during the Hudi write. Only when neither record is a DELETE record do we need to merge them.

} else {
return insertRecordOption.isPresent() ? oldValue : this;
}
} catch (IOException e) {
return super.preCombine(oldValue);
}
}

private Object mergeValue(Object left, Object right, Boolean chooseLeft) {
if (null != left && null != right) {
return chooseLeft ? left : right;
} else {
return null == left ? right : left;
}
}

}
@@ -0,0 +1,175 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hudi.common.model;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;

class PartialOverwriteWithLatestAvroPayloadTest {
private Schema schema;

@BeforeEach
public void setUp() throws Exception {
schema = Schema.createRecord("record", null, null, false, Arrays.asList(
new Schema.Field("id", Schema.create(Schema.Type.STRING), "", null),
new Schema.Field("partition", Schema.createUnion(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.NULL)), "", ""),
new Schema.Field("ts", Schema.create(Schema.Type.LONG), "", null),
new Schema.Field("_hoodie_is_deleted", Schema.create(Schema.Type.BOOLEAN), "", false),
new Schema.Field("city", Schema.createUnion(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.NULL)), "", null),
new Schema.Field("child", Schema.createArray(Schema.create(Schema.Type.STRING)), "", Collections.emptyList())
));
}

@Test
public void testActiveRecordsWithoutSchema() throws IOException {
GenericRecord record1 = new GenericData.Record(schema);
record1.put("id", "1");
record1.put("partition", "partition1");
record1.put("ts", 0L);
record1.put("_hoodie_is_deleted", false);
record1.put("city", "NY0");
record1.put("child", Arrays.asList("A"));

GenericRecord record2 = new GenericData.Record(schema);
record2.put("id", "2");
record2.put("partition", "");
record2.put("ts", 1L);
record2.put("_hoodie_is_deleted", false);
record2.put("city", "NY");
record2.put("child", Collections.emptyList());

GenericRecord record3 = new GenericData.Record(schema);
record3.put("id", "2");
record3.put("partition", "");
record3.put("ts", 1L);
record3.put("_hoodie_is_deleted", false);
record3.put("city", "NY");
record3.put("child", Arrays.asList("A"));


PartialOverwriteWithLatestAvroPayload payload1 = new PartialOverwriteWithLatestAvroPayload(record1, 1);
PartialOverwriteWithLatestAvroPayload payload2 = new PartialOverwriteWithLatestAvroPayload(record2, 2);
assertEquals(payload1.preCombine(payload2), payload2);
assertEquals(payload2.preCombine(payload1), payload2);

assertEquals(record1, payload1.getInsertValue(schema).get());
assertEquals(record2, payload2.getInsertValue(schema).get());

assertEquals(payload1.combineAndGetUpdateValue(record2, schema).get(), record1);
assertEquals(payload2.combineAndGetUpdateValue(record1, schema).get(), record3);
}

@Test
public void testCompareFunction() {
GenericRecord record = new GenericData.Record(schema);
record.put("id", "1");
record.put("partition", "partition1");
record.put("ts", 0L);
record.put("_hoodie_is_deleted", false);
record.put("city", "NY0");
record.put("child", Arrays.asList("A"));

PartialOverwriteWithLatestAvroPayload payload1 = new PartialOverwriteWithLatestAvroPayload(record, 1);
PartialOverwriteWithLatestAvroPayload payload2 = new PartialOverwriteWithLatestAvroPayload(record, 2);

assertEquals(payload1.compareTo(payload2), -1);
assertEquals(payload2.compareTo(payload1), 1);
assertEquals(payload1.compareTo(payload1), 0);
}

@Test
public void testActiveRecordsWithSchema() throws IOException {
GenericRecord record1 = new GenericData.Record(schema);
record1.put("id", "1");
record1.put("partition", "partition1");
record1.put("ts", 0L);
record1.put("_hoodie_is_deleted", false);
record1.put("city", null);
record1.put("child", Arrays.asList("A"));

GenericRecord record2 = new GenericData.Record(schema);
record2.put("id", "2");
record2.put("partition", null);
record2.put("ts", 1L);
record2.put("_hoodie_is_deleted", false);
record2.put("city", "NY");
record2.put("child", Collections.emptyList());

GenericRecord expectedRecord = new GenericData.Record(schema);
expectedRecord.put("id", "2");
expectedRecord.put("partition", "partition1");
expectedRecord.put("ts", 1L);
expectedRecord.put("_hoodie_is_deleted", false);
expectedRecord.put("city", "NY");
expectedRecord.put("child", Collections.emptyList());


PartialOverwriteWithLatestAvroPayload payload1 = new PartialOverwriteWithLatestAvroPayload(record1, 1, schema.toString());
PartialOverwriteWithLatestAvroPayload payload2 = new PartialOverwriteWithLatestAvroPayload(record2, 2, schema.toString());
PartialOverwriteWithLatestAvroPayload expectedPayload = new PartialOverwriteWithLatestAvroPayload(expectedRecord, 2, schema.toString());
assertArrayEquals(payload1.preCombine(payload2).recordBytes, expectedPayload.recordBytes);
assertArrayEquals(payload2.preCombine(payload1).recordBytes, expectedPayload.recordBytes);
assertEquals(payload1.preCombine(payload2).orderingVal, expectedPayload.orderingVal);
assertEquals(payload2.preCombine(payload1).orderingVal, expectedPayload.orderingVal);
}

@Test
public void testDeletedRecord() throws IOException {
GenericRecord record1 = new GenericData.Record(schema);
record1.put("id", "1");
record1.put("partition", "partition0");
record1.put("ts", 0L);
record1.put("_hoodie_is_deleted", false);
record1.put("city", "NY0");
record1.put("child", Collections.emptyList());

GenericRecord delRecord1 = new GenericData.Record(schema);
delRecord1.put("id", "2");
delRecord1.put("partition", "partition1");
delRecord1.put("ts", 1L);
delRecord1.put("_hoodie_is_deleted", true);
delRecord1.put("city", "NY0");
delRecord1.put("child", Collections.emptyList());

GenericRecord record2 = new GenericData.Record(schema);
record2.put("id", "1");
record2.put("partition", "partition0");
record2.put("ts", 0L);
record2.put("_hoodie_is_deleted", true);
record2.put("city", "NY0");
record2.put("child", Collections.emptyList());

PartialOverwriteWithLatestAvroPayload payload1 = new PartialOverwriteWithLatestAvroPayload(record1, 1, schema.toString());
PartialOverwriteWithLatestAvroPayload payload2 = new PartialOverwriteWithLatestAvroPayload(delRecord1, 2, schema.toString());

assertEquals(payload1.preCombine(payload2), payload1);
assertEquals(payload2.preCombine(payload1), payload1);
}

}
@@ -268,6 +268,12 @@ private FlinkOptions() {
.withDescription("Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting.\n"
+ "This will render any value set for the option in-effective");

public static final ConfigOption<Boolean> PARTIAL_OVERWRITE_ENABLED = ConfigOptions
Contributor

What's the idea for this additional configuration (besides the record payload class)?

Contributor Author

This is a feature toggle controlling another change to BucketAssignFunction, to support the case where the record partition path changes. But I have removed that change, so this feature toggle can be removed as well.

.key("partial.overwrite.enabled")
.booleanType()
.defaultValue(false)
.withDescription("Partial overwrite payload, the write.payload.class should be org.apache.hudi.common.model.PartialOverwriteWithLatestAvroPayload when it is true");

/**
* Flag to indicate whether to drop duplicates before insert/upsert.
* By default false to gain extra performance.