diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java
new file mode 100644
index 0000000000000..33f1d9f0025b2
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.model.debezium;
+
+import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+
+/**
+ * Base class that provides support for seamlessly applying changes captured via Debezium.
+ *
+ * Debezium change event types are determined for the op field in the payload
+ *
+ * - For inserts, op=i
+ * - For deletes, op=d
+ * - For updates, op=u
+ * - For snapshort inserts, op=r
+ *
+ * This payload implementation will issue matching insert, delete, updates against the hudi table
+ */
+public abstract class AbstractDebeziumAvroPayload extends OverwriteWithLatestAvroPayload {
+
+ private static final Logger LOG = LogManager.getLogger(AbstractDebeziumAvroPayload.class);
+
+ public AbstractDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) {
+ super(record, orderingVal);
+ }
+
+ public AbstractDebeziumAvroPayload(Option record) {
+ super(record);
+ }
+
+ @Override
+ public Option getInsertValue(Schema schema) throws IOException {
+ IndexedRecord insertRecord = getInsertRecord(schema);
+ return handleDeleteOperation(insertRecord);
+ }
+
+ @Override
+ public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException {
+ // Step 1: If the time occurrence of the current record in storage is higher than the time occurrence of the
+ // insert record (including a delete record), pick the current record.
+ if (shouldPickCurrentRecord(currentValue, getInsertRecord(schema), schema)) {
+ return Option.of(currentValue);
+ }
+ // Step 2: Pick the insert record (as a delete record if its a deleted event)
+ return getInsertValue(schema);
+ }
+
+ protected abstract boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException;
+
+ private Option handleDeleteOperation(IndexedRecord insertRecord) {
+ boolean delete = false;
+ if (insertRecord instanceof GenericRecord) {
+ GenericRecord record = (GenericRecord) insertRecord;
+ Object value = record.get(DebeziumConstants.FLATTENED_OP_COL_NAME);
+ delete = value != null && value.toString().equalsIgnoreCase(DebeziumConstants.DELETE_OP);
+ }
+
+ return delete ? Option.empty() : Option.of(insertRecord);
+ }
+
+ private IndexedRecord getInsertRecord(Schema schema) throws IOException {
+ return super.getInsertValue(schema).get();
+ }
+}
\ No newline at end of file
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java
new file mode 100644
index 0000000000000..d3e115e3d9d8a
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.model.debezium;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Constants used by {@link DebeziumSource} and {@link DebeziumAvroPayload}.
+ */
+public class DebeziumConstants {
+
+ // INPUT COLUMNS
+ public static final String INCOMING_BEFORE_FIELD = "before";
+ public static final String INCOMING_AFTER_FIELD = "after";
+ public static final String INCOMING_SOURCE_FIELD = "source";
+ public static final String INCOMING_OP_FIELD = "op";
+ public static final String INCOMING_TS_MS_FIELD = "ts_ms";
+
+ public static final String INCOMING_SOURCE_NAME_FIELD = "source.name";
+ public static final String INCOMING_SOURCE_TS_MS_FIELD = "source.ts_ms";
+ public static final String INCOMING_SOURCE_TXID_FIELD = "source.txId";
+
+ // INPUT COLUMNS SPECIFIC TO MYSQL
+ public static final String INCOMING_SOURCE_FILE_FIELD = "source.file";
+ public static final String INCOMING_SOURCE_POS_FIELD = "source.pos";
+ public static final String INCOMING_SOURCE_ROW_FIELD = "source.row";
+
+ // INPUT COLUMNS SPECIFIC TO POSTGRES
+ public static final String INCOMING_SOURCE_LSN_FIELD = "source.lsn";
+ public static final String INCOMING_SOURCE_XMIN_FIELD = "source.xmin";
+
+ // OUTPUT COLUMNS
+ public static final String FLATTENED_OP_COL_NAME = "_change_operation_type";
+ public static final String UPSTREAM_PROCESSING_TS_COL_NAME = "_upstream_event_processed_ts_ms";
+ public static final String FLATTENED_SHARD_NAME = "db_shard_source_partition";
+ public static final String FLATTENED_TS_COL_NAME = "_event_origin_ts_ms";
+ public static final String FLATTENED_TX_ID_COL_NAME = "_event_tx_id";
+
+ // OUTPUT COLUMNS SPECIFIC TO MYSQL
+ public static final String FLATTENED_FILE_COL_NAME = "_event_bin_file";
+ public static final String FLATTENED_POS_COL_NAME = "_event_pos";
+ public static final String FLATTENED_ROW_COL_NAME = "_event_row";
+ public static final String ADDED_SEQ_COL_NAME = "_event_seq";
+
+ // OUTPUT COLUMNS SPECIFIC TO POSTGRES
+ public static final String FLATTENED_LSN_COL_NAME = "_event_lsn";
+ public static final String FLATTENED_XMIN_COL_NAME = "_event_xmin";
+
+ // Other Constants
+ public static final String DELETE_OP = "d";
+
+ // List of meta data columns
+ public static List META_COLUMNS = Collections.unmodifiableList(Arrays.asList(
+ FLATTENED_OP_COL_NAME,
+ UPSTREAM_PROCESSING_TS_COL_NAME,
+ FLATTENED_TS_COL_NAME,
+ FLATTENED_TX_ID_COL_NAME,
+ FLATTENED_LSN_COL_NAME,
+ FLATTENED_XMIN_COL_NAME,
+ FLATTENED_SHARD_NAME
+ ));
+}
+
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java
new file mode 100644
index 0000000000000..ea6165d55d3bf
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.model.debezium;
+
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+
+/**
+ * Provides support for seamlessly applying changes captured via Debezium for MysqlDB.
+ *
+ * Debezium change event types are determined for the op field in the payload
+ *
+ * - For inserts, op=i
+ * - For deletes, op=d
+ * - For updates, op=u
+ * - For snapshort inserts, op=r
+ *
+ * This payload implementation will issue matching insert, delete, updates against the hudi table
+ */
+public class MySqlDebeziumAvroPayload extends AbstractDebeziumAvroPayload {
+
+ private static final Logger LOG = LogManager.getLogger(MySqlDebeziumAvroPayload.class);
+
+ public MySqlDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) {
+ super(record, orderingVal);
+ }
+
+ public MySqlDebeziumAvroPayload(Option record) {
+ super(record);
+ }
+
+ private String extractSeq(IndexedRecord record) {
+ return ((CharSequence) ((GenericRecord) record).get(DebeziumConstants.ADDED_SEQ_COL_NAME)).toString();
+ }
+
+ @Override
+ protected boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException {
+ String currentSourceSeq = extractSeq(currentRecord);
+ String insertSourceSeq = extractSeq(insertRecord);
+ // Pick the current value in storage only if its Seq (file+pos) is latest
+ // compared to the Seq (file+pos) of the insert value
+ return insertSourceSeq.compareTo(currentSourceSeq) < 0;
+ }
+}
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java
new file mode 100644
index 0000000000000..448627d97cbf7
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.model.debezium;
+
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+/**
+ * Provides support for seamlessly applying changes captured via Debezium for PostgresDB.
+ *
+ * Debezium change event types are determined for the op field in the payload
+ *
+ * - For inserts, op=i
+ * - For deletes, op=d
+ * - For updates, op=u
+ * - For snapshort inserts, op=r
+ *
+ * This payload implementation will issue matching insert, delete, updates against the hudi table
+ */
+public class PostgresDebeziumAvroPayload extends AbstractDebeziumAvroPayload {
+
+ private static final Logger LOG = LogManager.getLogger(PostgresDebeziumAvroPayload.class);
+ public static final String DEBEZIUM_TOASTED_VALUE = "__debezium_unavailable_value";
+
+ public PostgresDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) {
+ super(record, orderingVal);
+ }
+
+ public PostgresDebeziumAvroPayload(Option record) {
+ super(record);
+ }
+
+ private Long extractLSN(IndexedRecord record) {
+ GenericRecord genericRecord = (GenericRecord) record;
+ return (Long) genericRecord.get(DebeziumConstants.FLATTENED_LSN_COL_NAME);
+ }
+
+ @Override
+ protected boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException {
+ Long currentSourceLSN = extractLSN(currentRecord);
+ Long insertSourceLSN = extractLSN(insertRecord);
+
+ // Pick the current value in storage only if its LSN is latest compared to the LSN of the insert value
+ return insertSourceLSN < currentSourceLSN;
+ }
+
+ @Override
+ public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException {
+ // Specific to Postgres: If the updated record has TOASTED columns,
+ // we will need to keep the previous value for those columns
+ // see https://debezium.io/documentation/reference/connectors/postgresql.html#postgresql-toasted-values
+ Option insertOrDeleteRecord = super.combineAndGetUpdateValue(currentValue, schema);
+
+ if (insertOrDeleteRecord.isPresent()) {
+ mergeToastedValuesIfPresent(insertOrDeleteRecord.get(), currentValue);
+ }
+ return insertOrDeleteRecord;
+ }
+
+ private void mergeToastedValuesIfPresent(IndexedRecord incomingRecord, IndexedRecord currentRecord) {
+ List fields = incomingRecord.getSchema().getFields();
+
+ fields.forEach(field -> {
+ // There are only four avro data types that have unconstrained sizes, which are
+ // NON-NULLABLE STRING, NULLABLE STRING, NON-NULLABLE BYTES, NULLABLE BYTES
+ if (((GenericData.Record) incomingRecord).get(field.name()) != null
+ && (containsStringToastedValues(incomingRecord, field) || containsBytesToastedValues(incomingRecord, field))) {
+ ((GenericData.Record) incomingRecord).put(field.name(), ((GenericData.Record) currentRecord).get(field.name()));
+ }
+ });
+ }
+
+ /**
+ * Returns true if a column is either of type string or a union of one or more strings that contain a debezium toasted value.
+ *
+ * @param incomingRecord The incoming avro record
+ * @param field the column of interest
+ * @return
+ */
+ private boolean containsStringToastedValues(IndexedRecord incomingRecord, Schema.Field field) {
+ return ((field.schema().getType() == Schema.Type.STRING
+ || (field.schema().getType() == Schema.Type.UNION && field.schema().getTypes().stream().anyMatch(s -> s.getType() == Schema.Type.STRING)))
+ // Check length first as an optimization
+ && ((CharSequence) ((GenericData.Record) incomingRecord).get(field.name())).length() == DEBEZIUM_TOASTED_VALUE.length()
+ && DEBEZIUM_TOASTED_VALUE.equals(((CharSequence) ((GenericData.Record) incomingRecord).get(field.name())).toString()));
+ }
+
+ /**
+ * Returns true if a column is either of type bytes or a union of one or more bytes that contain a debezium toasted value.
+ *
+ * @param incomingRecord The incoming avro record
+ * @param field the column of interest
+ * @return
+ */
+ private boolean containsBytesToastedValues(IndexedRecord incomingRecord, Schema.Field field) {
+ return ((field.schema().getType() == Schema.Type.BYTES
+ || (field.schema().getType() == Schema.Type.UNION && field.schema().getTypes().stream().anyMatch(s -> s.getType() == Schema.Type.BYTES)))
+ // Check length first as an optimization
+ && ((ByteBuffer) ((GenericData.Record) incomingRecord).get(field.name())).array().length == DEBEZIUM_TOASTED_VALUE.length()
+ && DEBEZIUM_TOASTED_VALUE.equals(new String(((ByteBuffer) ((GenericData.Record) incomingRecord).get(field.name())).array(), StandardCharsets.UTF_8)));
+ }
+}
+
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java
new file mode 100644
index 0000000000000..6163c0ac468f5
--- /dev/null
+++ b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or mo contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.model.debezium;
+
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.IndexedRecord;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+public class TestMySqlDebeziumAvroPayload {
+
+ private static final String KEY_FIELD_NAME = "Key";
+
+ private Schema avroSchema;
+
+ @BeforeEach
+ void setUp() {
+ this.avroSchema = Schema.createRecord(Arrays.asList(
+ new Schema.Field(KEY_FIELD_NAME, Schema.create(Schema.Type.INT), "", 0),
+ new Schema.Field(DebeziumConstants.FLATTENED_OP_COL_NAME, Schema.create(Schema.Type.STRING), "", null),
+ new Schema.Field(DebeziumConstants.ADDED_SEQ_COL_NAME, Schema.create(Schema.Type.STRING), "", null)
+ ));
+ }
+
+ @Test
+ public void testInsert() throws IOException {
+ GenericRecord insertRecord = createRecord(0, Operation.INSERT, "00001.111");
+ MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(insertRecord, "00001.111");
+ validateRecord(payload.getInsertValue(avroSchema), 0, Operation.INSERT, "00001.111");
+ }
+
+ @Test
+ public void testPreCombine() {
+ GenericRecord insertRecord = createRecord(0, Operation.INSERT, "00002.111");
+ MySqlDebeziumAvroPayload insertPayload = new MySqlDebeziumAvroPayload(insertRecord, "00002.111");
+
+ GenericRecord updateRecord = createRecord(0, Operation.UPDATE, "00001.111");
+ MySqlDebeziumAvroPayload updatePayload = new MySqlDebeziumAvroPayload(updateRecord, "00001.111");
+
+ GenericRecord deleteRecord = createRecord(0, Operation.DELETE, "00002.11");
+ MySqlDebeziumAvroPayload deletePayload = new MySqlDebeziumAvroPayload(deleteRecord, "00002.11");
+
+ assertEquals(insertPayload, insertPayload.preCombine(updatePayload));
+ assertEquals(deletePayload, deletePayload.preCombine(updatePayload));
+ assertEquals(insertPayload, deletePayload.preCombine(insertPayload));
+ }
+
+ @Test
+ public void testMergeWithUpdate() throws IOException {
+ GenericRecord updateRecord = createRecord(1, Operation.UPDATE, "00002.11");
+ MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(updateRecord, "00002.11");
+
+ GenericRecord existingRecord = createRecord(1, Operation.INSERT, "00001.111");
+ Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ validateRecord(mergedRecord, 1, Operation.UPDATE, "00002.11");
+
+ GenericRecord lateRecord = createRecord(1, Operation.UPDATE, "00000.222");
+ payload = new MySqlDebeziumAvroPayload(lateRecord, "00000.222");
+ mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ validateRecord(mergedRecord, 1, Operation.INSERT, "00001.111");
+ }
+
+ @Test
+ public void testMergeWithDelete() throws IOException {
+ GenericRecord deleteRecord = createRecord(2, Operation.DELETE, "00002.11");
+ MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(deleteRecord, "00002.11");
+
+ GenericRecord existingRecord = createRecord(2, Operation.UPDATE, "00001.111");
+ Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ // expect nothing to be committed to table
+ assertFalse(mergedRecord.isPresent());
+
+ GenericRecord lateRecord = createRecord(2, Operation.DELETE, "00000.222");
+ payload = new MySqlDebeziumAvroPayload(lateRecord, "00000.222");
+ mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ validateRecord(mergedRecord, 2, Operation.UPDATE, "00001.111");
+ }
+
+ private GenericRecord createRecord(int primaryKeyValue, Operation op, String seqValue) {
+ GenericRecord record = new GenericData.Record(avroSchema);
+ record.put(KEY_FIELD_NAME, primaryKeyValue);
+ record.put(DebeziumConstants.FLATTENED_OP_COL_NAME, op.op);
+ record.put(DebeziumConstants.ADDED_SEQ_COL_NAME, seqValue);
+ return record;
+ }
+
+ private void validateRecord(Option iRecord, int primaryKeyValue, Operation op, String seqValue) {
+ IndexedRecord record = iRecord.get();
+ assertEquals(primaryKeyValue, (int) record.get(0));
+ assertEquals(op.op, record.get(1).toString());
+ assertEquals(seqValue, record.get(2).toString());
+ }
+
+ private enum Operation {
+ INSERT("c"),
+ UPDATE("u"),
+ DELETE("d");
+
+ public final String op;
+
+ Operation(String op) {
+ this.op = op;
+ }
+ }
+}
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java
new file mode 100644
index 0000000000000..07512b1c36eaa
--- /dev/null
+++ b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or mo contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.model.debezium;
+
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaBuilder;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.avro.util.Utf8;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+public class TestPostgresDebeziumAvroPayload {
+
+ private static final String KEY_FIELD_NAME = "Key";
+ private Schema avroSchema;
+
+ @BeforeEach
+ void setUp() {
+ this.avroSchema = Schema.createRecord(Arrays.asList(
+ new Schema.Field(KEY_FIELD_NAME, Schema.create(Schema.Type.INT), "", 0),
+ new Schema.Field(DebeziumConstants.FLATTENED_OP_COL_NAME, Schema.create(Schema.Type.STRING), "", null),
+ new Schema.Field(DebeziumConstants.FLATTENED_LSN_COL_NAME, Schema.create(Schema.Type.LONG), "", null)
+ ));
+ }
+
+ @Test
+ public void testInsert() throws IOException {
+ GenericRecord insertRecord = createRecord(0, Operation.INSERT, 100L);
+ PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(insertRecord, 100L);
+ validateRecord(payload.getInsertValue(avroSchema), 0, Operation.INSERT, 100L);
+ }
+
+ @Test
+ public void testPreCombine() {
+ GenericRecord insertRecord = createRecord(0, Operation.INSERT, 120L);
+ PostgresDebeziumAvroPayload insertPayload = new PostgresDebeziumAvroPayload(insertRecord, 120L);
+
+ GenericRecord updateRecord = createRecord(0, Operation.UPDATE, 99L);
+ PostgresDebeziumAvroPayload updatePayload = new PostgresDebeziumAvroPayload(updateRecord, 99L);
+
+ GenericRecord deleteRecord = createRecord(0, Operation.DELETE, 111L);
+ PostgresDebeziumAvroPayload deletePayload = new PostgresDebeziumAvroPayload(deleteRecord, 111L);
+
+ assertEquals(insertPayload, insertPayload.preCombine(updatePayload));
+ assertEquals(deletePayload, deletePayload.preCombine(updatePayload));
+ assertEquals(insertPayload, deletePayload.preCombine(insertPayload));
+ }
+
+ @Test
+ public void testMergeWithUpdate() throws IOException {
+ GenericRecord updateRecord = createRecord(1, Operation.UPDATE, 100L);
+ PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(updateRecord, 100L);
+
+ GenericRecord existingRecord = createRecord(1, Operation.INSERT, 99L);
+ Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ validateRecord(mergedRecord, 1, Operation.UPDATE, 100L);
+
+ GenericRecord lateRecord = createRecord(1, Operation.UPDATE, 98L);
+ payload = new PostgresDebeziumAvroPayload(lateRecord, 98L);
+ mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ validateRecord(mergedRecord, 1, Operation.INSERT, 99L);
+ }
+
+ @Test
+ public void testMergeWithDelete() throws IOException {
+ GenericRecord deleteRecord = createRecord(2, Operation.DELETE, 100L);
+ PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(deleteRecord, 100L);
+
+ GenericRecord existingRecord = createRecord(2, Operation.UPDATE, 99L);
+ Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ // expect nothing to be committed to table
+ assertFalse(mergedRecord.isPresent());
+
+ GenericRecord lateRecord = createRecord(2, Operation.DELETE, 98L);
+ payload = new PostgresDebeziumAvroPayload(lateRecord, 98L);
+ mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema);
+ validateRecord(mergedRecord, 2, Operation.UPDATE, 99L);
+ }
+
+ @Test
+ public void testMergeWithToastedValues() throws IOException {
+ Schema avroSchema = SchemaBuilder.builder()
+ .record("test_schema")
+ .namespace("test_namespace")
+ .fields()
+ .name(DebeziumConstants.FLATTENED_LSN_COL_NAME).type().longType().noDefault()
+ .name("string_col").type().stringType().noDefault()
+ .name("byte_col").type().bytesType().noDefault()
+ .name("string_null_col_1").type().nullable().stringType().noDefault()
+ .name("byte_null_col_1").type().nullable().bytesType().noDefault()
+ .name("string_null_col_2").type().nullable().stringType().noDefault()
+ .name("byte_null_col_2").type().nullable().bytesType().noDefault()
+ .endRecord();
+
+ GenericRecord oldVal = new GenericData.Record(avroSchema);
+ oldVal.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, 100L);
+ oldVal.put("string_col", "valid string value");
+ oldVal.put("byte_col", ByteBuffer.wrap("valid byte value".getBytes()));
+ oldVal.put("string_null_col_1", "valid string value");
+ oldVal.put("byte_null_col_1", ByteBuffer.wrap("valid byte value".getBytes()));
+ oldVal.put("string_null_col_2", null);
+ oldVal.put("byte_null_col_2", null);
+
+ GenericRecord newVal = new GenericData.Record(avroSchema);
+ newVal.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, 105L);
+ newVal.put("string_col", PostgresDebeziumAvroPayload.DEBEZIUM_TOASTED_VALUE);
+ newVal.put("byte_col", ByteBuffer.wrap(PostgresDebeziumAvroPayload.DEBEZIUM_TOASTED_VALUE.getBytes()));
+ newVal.put("string_null_col_1", null);
+ newVal.put("byte_null_col_1", null);
+ newVal.put("string_null_col_2", "valid string value");
+ newVal.put("byte_null_col_2", ByteBuffer.wrap("valid byte value".getBytes()));
+
+ PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(Option.of(newVal));
+
+ GenericRecord outputRecord = (GenericRecord) payload
+ .combineAndGetUpdateValue(oldVal, avroSchema).get();
+
+ assertEquals("valid string value", outputRecord.get("string_col"));
+ assertEquals("valid byte value", new String(((ByteBuffer) outputRecord.get("byte_col")).array(), StandardCharsets.UTF_8));
+ assertNull(outputRecord.get("string_null_col_1"));
+ assertNull(outputRecord.get("byte_null_col_1"));
+ assertEquals("valid string value", ((Utf8) outputRecord.get("string_null_col_2")).toString());
+ assertEquals("valid byte value", new String(((ByteBuffer) outputRecord.get("byte_null_col_2")).array(), StandardCharsets.UTF_8));
+ }
+
+ private GenericRecord createRecord(int primaryKeyValue, Operation op, long lsnValue) {
+ GenericRecord record = new GenericData.Record(avroSchema);
+ record.put(KEY_FIELD_NAME, primaryKeyValue);
+ record.put(DebeziumConstants.FLATTENED_OP_COL_NAME, op.op);
+ record.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, lsnValue);
+ return record;
+ }
+
+ private void validateRecord(Option iRecord, int primaryKeyValue, Operation op, long lsnValue) {
+ IndexedRecord record = iRecord.get();
+ assertEquals(primaryKeyValue, (int) record.get(0));
+ assertEquals(op.op, record.get(1).toString());
+ assertEquals(lsnValue, (long) record.get(2));
+ }
+
+ private enum Operation {
+ INSERT("c"),
+ UPDATE("u"),
+ DELETE("d");
+
+ public final String op;
+
+ Operation(String op) {
+ this.op = op;
+ }
+ }
+}
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java
index 32ef60967ae8a..216369296ad53 100644
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java
@@ -49,7 +49,7 @@ public class SchemaRegistryProvider extends SchemaProvider {
*/
public static class Config {
- private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
+ public static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
private static final String TARGET_SCHEMA_REGISTRY_URL_PROP =
"hoodie.deltastreamer.schemaprovider.registry.targetUrl";
}
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java
new file mode 100644
index 0000000000000..7018419c2d6de
--- /dev/null
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.sources.debezium;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Field;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.generic.GenericRecord;
+
+import org.apache.hudi.AvroConversionUtils;
+import org.apache.hudi.DataSourceWriteOptions;
+import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
+import org.apache.hudi.utilities.schema.SchemaProvider;
+import org.apache.hudi.utilities.schema.SchemaRegistryProvider;
+import org.apache.hudi.utilities.sources.RowSource;
+import org.apache.hudi.utilities.sources.helpers.AvroConvertor;
+import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen;
+import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
+
+import org.apache.kafka.common.serialization.StringDeserializer;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.functions;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.streaming.kafka010.KafkaUtils;
+import org.apache.spark.streaming.kafka010.LocationStrategies;
+import org.apache.spark.streaming.kafka010.OffsetRange;
+
+/**
+ * Base class for Debezium streaming source which expects change events as Kafka Avro records.
+ * Obtains the schema from the confluent schema-registry.
+ */
+public abstract class DebeziumSource extends RowSource {
+
+ private static final Logger LOG = LogManager.getLogger(DebeziumSource.class);
+ // these are native kafka's config. do not change the config names.
+ private static final String NATIVE_KAFKA_KEY_DESERIALIZER_PROP = "key.deserializer";
+ private static final String NATIVE_KAFKA_VALUE_DESERIALIZER_PROP = "value.deserializer";
+ private static final String OVERRIDE_CHECKPOINT_STRING = "hoodie.debezium.override.initial.checkpoint.key";
+ private static final String CONNECT_NAME_KEY = "connect.name";
+ private static final String DATE_CONNECT_NAME = "custom.debezium.DateString";
+
+ private final KafkaOffsetGen offsetGen;
+ private final HoodieDeltaStreamerMetrics metrics;
+ private final SchemaRegistryProvider schemaRegistryProvider;
+ private final String deserializerClassName;
+
+ public DebeziumSource(TypedProperties props, JavaSparkContext sparkContext,
+ SparkSession sparkSession,
+ SchemaProvider schemaProvider,
+ HoodieDeltaStreamerMetrics metrics) {
+ super(props, sparkContext, sparkSession, schemaProvider);
+
+ props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class);
+ deserializerClassName = props.getString(DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().key(),
+ DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().defaultValue());
+
+ try {
+ props.put(NATIVE_KAFKA_VALUE_DESERIALIZER_PROP, Class.forName(deserializerClassName));
+ } catch (ClassNotFoundException e) {
+ String error = "Could not load custom avro kafka deserializer: " + deserializerClassName;
+ LOG.error(error);
+ throw new HoodieException(error, e);
+ }
+
+ // Currently, debezium source requires Confluent/Kafka schema-registry to fetch the latest schema.
+ if (schemaProvider == null || !(schemaProvider instanceof SchemaRegistryProvider)) {
+ schemaRegistryProvider = new SchemaRegistryProvider(props, sparkContext);
+ } else {
+ schemaRegistryProvider = (SchemaRegistryProvider) schemaProvider;
+ }
+
+ offsetGen = new KafkaOffsetGen(props);
+ this.metrics = metrics;
+ }
+
+ @Override
+ protected Pair