diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java new file mode 100644 index 0000000000000..33f1d9f0025b2 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/AbstractDebeziumAvroPayload.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload; +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; + +/** + * Base class that provides support for seamlessly applying changes captured via Debezium. + *

+ * Debezium change event types are determined from the op field in the payload + *

+ * - For inserts, op=i + * - For deletes, op=d + * - For updates, op=u + * - For snapshot inserts, op=r + *

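+ * Illustrative sketch of the resulting merge behavior ({@code payload} below is a hypothetical
+ * instance of this class):
+ * <pre>{@code
+ *   // a flattened delete event carries _change_operation_type = "d"
+ *   payload.getInsertValue(schema); // returns Option.empty(), so the row is deleted
+ *   // for any other op value the record itself is returned and upserted
+ * }</pre>
+ *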
+ * This payload implementation will issue matching insert, delete, updates against the hudi table + */ +public abstract class AbstractDebeziumAvroPayload extends OverwriteWithLatestAvroPayload { + + private static final Logger LOG = LogManager.getLogger(AbstractDebeziumAvroPayload.class); + + public AbstractDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public AbstractDebeziumAvroPayload(Option record) { + super(record); + } + + @Override + public Option getInsertValue(Schema schema) throws IOException { + IndexedRecord insertRecord = getInsertRecord(schema); + return handleDeleteOperation(insertRecord); + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { + // Step 1: If the time occurrence of the current record in storage is higher than the time occurrence of the + // insert record (including a delete record), pick the current record. + if (shouldPickCurrentRecord(currentValue, getInsertRecord(schema), schema)) { + return Option.of(currentValue); + } + // Step 2: Pick the insert record (as a delete record if its a deleted event) + return getInsertValue(schema); + } + + protected abstract boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException; + + private Option handleDeleteOperation(IndexedRecord insertRecord) { + boolean delete = false; + if (insertRecord instanceof GenericRecord) { + GenericRecord record = (GenericRecord) insertRecord; + Object value = record.get(DebeziumConstants.FLATTENED_OP_COL_NAME); + delete = value != null && value.toString().equalsIgnoreCase(DebeziumConstants.DELETE_OP); + } + + return delete ? Option.empty() : Option.of(insertRecord); + } + + private IndexedRecord getInsertRecord(Schema schema) throws IOException { + return super.getInsertValue(schema).get(); + } +} \ No newline at end of file diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java new file mode 100644 index 0000000000000..d3e115e3d9d8a --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/DebeziumConstants.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Constants used by {@link DebeziumSource} and {@link DebeziumAvroPayload}. 
+ */ +public class DebeziumConstants { + + // INPUT COLUMNS + public static final String INCOMING_BEFORE_FIELD = "before"; + public static final String INCOMING_AFTER_FIELD = "after"; + public static final String INCOMING_SOURCE_FIELD = "source"; + public static final String INCOMING_OP_FIELD = "op"; + public static final String INCOMING_TS_MS_FIELD = "ts_ms"; + + public static final String INCOMING_SOURCE_NAME_FIELD = "source.name"; + public static final String INCOMING_SOURCE_TS_MS_FIELD = "source.ts_ms"; + public static final String INCOMING_SOURCE_TXID_FIELD = "source.txId"; + + // INPUT COLUMNS SPECIFIC TO MYSQL + public static final String INCOMING_SOURCE_FILE_FIELD = "source.file"; + public static final String INCOMING_SOURCE_POS_FIELD = "source.pos"; + public static final String INCOMING_SOURCE_ROW_FIELD = "source.row"; + + // INPUT COLUMNS SPECIFIC TO POSTGRES + public static final String INCOMING_SOURCE_LSN_FIELD = "source.lsn"; + public static final String INCOMING_SOURCE_XMIN_FIELD = "source.xmin"; + + // OUTPUT COLUMNS + public static final String FLATTENED_OP_COL_NAME = "_change_operation_type"; + public static final String UPSTREAM_PROCESSING_TS_COL_NAME = "_upstream_event_processed_ts_ms"; + public static final String FLATTENED_SHARD_NAME = "db_shard_source_partition"; + public static final String FLATTENED_TS_COL_NAME = "_event_origin_ts_ms"; + public static final String FLATTENED_TX_ID_COL_NAME = "_event_tx_id"; + + // OUTPUT COLUMNS SPECIFIC TO MYSQL + public static final String FLATTENED_FILE_COL_NAME = "_event_bin_file"; + public static final String FLATTENED_POS_COL_NAME = "_event_pos"; + public static final String FLATTENED_ROW_COL_NAME = "_event_row"; + public static final String ADDED_SEQ_COL_NAME = "_event_seq"; + + // OUTPUT COLUMNS SPECIFIC TO POSTGRES + public static final String FLATTENED_LSN_COL_NAME = "_event_lsn"; + public static final String FLATTENED_XMIN_COL_NAME = "_event_xmin"; + + // Other Constants + public static final String DELETE_OP = "d"; + + // List of meta data columns + public static List META_COLUMNS = Collections.unmodifiableList(Arrays.asList( + FLATTENED_OP_COL_NAME, + UPSTREAM_PROCESSING_TS_COL_NAME, + FLATTENED_TS_COL_NAME, + FLATTENED_TX_ID_COL_NAME, + FLATTENED_LSN_COL_NAME, + FLATTENED_XMIN_COL_NAME, + FLATTENED_SHARD_NAME + )); +} + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java new file mode 100644 index 0000000000000..ea6165d55d3bf --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/MySqlDebeziumAvroPayload.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; + +/** + * Provides support for seamlessly applying changes captured via Debezium for MysqlDB. + *

+ * Debezium change event types are determined from the op field in the payload + *

+ * - For inserts, op=i + * - For deletes, op=d + * - For updates, op=u + * - For snapshot inserts, op=r + *

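+ * Records are ordered by the flattened {@code _event_seq} column (binlog file number + "." +
+ * position), compared lexically; a minimal illustration with made-up values:
+ * <pre>{@code
+ *   "00002.11".compareTo("00001.111") > 0  // binlog file 00002 is newer than file 00001
+ * }</pre>
+ *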
+ * This payload implementation will issue matching insert, delete, updates against the hudi table + */ +public class MySqlDebeziumAvroPayload extends AbstractDebeziumAvroPayload { + + private static final Logger LOG = LogManager.getLogger(MySqlDebeziumAvroPayload.class); + + public MySqlDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public MySqlDebeziumAvroPayload(Option record) { + super(record); + } + + private String extractSeq(IndexedRecord record) { + return ((CharSequence) ((GenericRecord) record).get(DebeziumConstants.ADDED_SEQ_COL_NAME)).toString(); + } + + @Override + protected boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException { + String currentSourceSeq = extractSeq(currentRecord); + String insertSourceSeq = extractSeq(insertRecord); + // Pick the current value in storage only if its Seq (file+pos) is latest + // compared to the Seq (file+pos) of the insert value + return insertSourceSeq.compareTo(currentSourceSeq) < 0; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java new file mode 100644 index 0000000000000..448627d97cbf7 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/model/debezium/PostgresDebeziumAvroPayload.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.List; + +/** + * Provides support for seamlessly applying changes captured via Debezium for PostgresDB. + *

+ * Debezium change event types are determined from the op field in the payload + *

+ * - For inserts, op=i + * - For deletes, op=d + * - For updates, op=u + * - For snapshot inserts, op=r + *

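+ * Records are ordered by the flattened {@code _event_lsn} column: at merge time the current
+ * record in storage is kept only when {@code insertSourceLSN < currentSourceLSN}.
+ *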
+ * This payload implementation will issue matching insert, delete, updates against the hudi table + */ +public class PostgresDebeziumAvroPayload extends AbstractDebeziumAvroPayload { + + private static final Logger LOG = LogManager.getLogger(PostgresDebeziumAvroPayload.class); + public static final String DEBEZIUM_TOASTED_VALUE = "__debezium_unavailable_value"; + + public PostgresDebeziumAvroPayload(GenericRecord record, Comparable orderingVal) { + super(record, orderingVal); + } + + public PostgresDebeziumAvroPayload(Option record) { + super(record); + } + + private Long extractLSN(IndexedRecord record) { + GenericRecord genericRecord = (GenericRecord) record; + return (Long) genericRecord.get(DebeziumConstants.FLATTENED_LSN_COL_NAME); + } + + @Override + protected boolean shouldPickCurrentRecord(IndexedRecord currentRecord, IndexedRecord insertRecord, Schema schema) throws IOException { + Long currentSourceLSN = extractLSN(currentRecord); + Long insertSourceLSN = extractLSN(insertRecord); + + // Pick the current value in storage only if its LSN is latest compared to the LSN of the insert value + return insertSourceLSN < currentSourceLSN; + } + + @Override + public Option combineAndGetUpdateValue(IndexedRecord currentValue, Schema schema) throws IOException { + // Specific to Postgres: If the updated record has TOASTED columns, + // we will need to keep the previous value for those columns + // see https://debezium.io/documentation/reference/connectors/postgresql.html#postgresql-toasted-values + Option insertOrDeleteRecord = super.combineAndGetUpdateValue(currentValue, schema); + + if (insertOrDeleteRecord.isPresent()) { + mergeToastedValuesIfPresent(insertOrDeleteRecord.get(), currentValue); + } + return insertOrDeleteRecord; + } + + private void mergeToastedValuesIfPresent(IndexedRecord incomingRecord, IndexedRecord currentRecord) { + List fields = incomingRecord.getSchema().getFields(); + + fields.forEach(field -> { + // There are only four avro data types that have unconstrained sizes, which are + // NON-NULLABLE STRING, NULLABLE STRING, NON-NULLABLE BYTES, NULLABLE BYTES + if (((GenericData.Record) incomingRecord).get(field.name()) != null + && (containsStringToastedValues(incomingRecord, field) || containsBytesToastedValues(incomingRecord, field))) { + ((GenericData.Record) incomingRecord).put(field.name(), ((GenericData.Record) currentRecord).get(field.name())); + } + }); + } + + /** + * Returns true if a column is either of type string or a union of one or more strings that contain a debezium toasted value. + * + * @param incomingRecord The incoming avro record + * @param field the column of interest + * @return + */ + private boolean containsStringToastedValues(IndexedRecord incomingRecord, Schema.Field field) { + return ((field.schema().getType() == Schema.Type.STRING + || (field.schema().getType() == Schema.Type.UNION && field.schema().getTypes().stream().anyMatch(s -> s.getType() == Schema.Type.STRING))) + // Check length first as an optimization + && ((CharSequence) ((GenericData.Record) incomingRecord).get(field.name())).length() == DEBEZIUM_TOASTED_VALUE.length() + && DEBEZIUM_TOASTED_VALUE.equals(((CharSequence) ((GenericData.Record) incomingRecord).get(field.name())).toString())); + } + + /** + * Returns true if a column is either of type bytes or a union of one or more bytes that contain a debezium toasted value. 
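+ * Debezium emits the literal placeholder {@code __debezium_unavailable_value} for an unchanged
+ * TOAST-ed column; when that placeholder is detected here, the previous value from storage is
+ * carried over instead.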
+ * + * @param incomingRecord The incoming avro record + * @param field the column of interest + * @return true if the field holds the Debezium toasted placeholder as bytes + */ + private boolean containsBytesToastedValues(IndexedRecord incomingRecord, Schema.Field field) { + return ((field.schema().getType() == Schema.Type.BYTES + || (field.schema().getType() == Schema.Type.UNION && field.schema().getTypes().stream().anyMatch(s -> s.getType() == Schema.Type.BYTES))) + // Check length first as an optimization + && ((ByteBuffer) ((GenericData.Record) incomingRecord).get(field.name())).array().length == DEBEZIUM_TOASTED_VALUE.length() + && DEBEZIUM_TOASTED_VALUE.equals(new String(((ByteBuffer) ((GenericData.Record) incomingRecord).get(field.name())).array(), StandardCharsets.UTF_8))); + } +} + diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java new file mode 100644 index 0000000000000..6163c0ac468f5 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestMySqlDebeziumAvroPayload.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class TestMySqlDebeziumAvroPayload { + + private static final String KEY_FIELD_NAME = "Key"; + + private Schema avroSchema; + + @BeforeEach + void setUp() { + this.avroSchema = Schema.createRecord(Arrays.asList( + new Schema.Field(KEY_FIELD_NAME, Schema.create(Schema.Type.INT), "", 0), + new Schema.Field(DebeziumConstants.FLATTENED_OP_COL_NAME, Schema.create(Schema.Type.STRING), "", null), + new Schema.Field(DebeziumConstants.ADDED_SEQ_COL_NAME, Schema.create(Schema.Type.STRING), "", null) + )); + } + + @Test + public void testInsert() throws IOException { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, "00001.111"); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(insertRecord, "00001.111"); + validateRecord(payload.getInsertValue(avroSchema), 0, Operation.INSERT, "00001.111"); + } + + @Test + public void testPreCombine() { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, "00002.111"); + MySqlDebeziumAvroPayload insertPayload = new MySqlDebeziumAvroPayload(insertRecord, "00002.111"); + + GenericRecord updateRecord = createRecord(0, Operation.UPDATE, "00001.111"); + MySqlDebeziumAvroPayload updatePayload = new MySqlDebeziumAvroPayload(updateRecord, "00001.111"); + + GenericRecord deleteRecord = createRecord(0, Operation.DELETE, "00002.11"); + MySqlDebeziumAvroPayload deletePayload = new MySqlDebeziumAvroPayload(deleteRecord, "00002.11"); + + assertEquals(insertPayload, insertPayload.preCombine(updatePayload)); + assertEquals(deletePayload, deletePayload.preCombine(updatePayload)); + assertEquals(insertPayload, deletePayload.preCombine(insertPayload)); + } + + @Test + public void testMergeWithUpdate() throws IOException { + GenericRecord updateRecord = createRecord(1, Operation.UPDATE, "00002.11"); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(updateRecord, "00002.11"); + + GenericRecord existingRecord = createRecord(1, Operation.INSERT, "00001.111"); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.UPDATE, "00002.11"); + + GenericRecord lateRecord = createRecord(1, Operation.UPDATE, "00000.222"); + payload = new MySqlDebeziumAvroPayload(lateRecord, "00000.222"); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.INSERT, "00001.111"); + } + + @Test + public void testMergeWithDelete() throws IOException { + GenericRecord deleteRecord = createRecord(2, Operation.DELETE, "00002.11"); + MySqlDebeziumAvroPayload payload = new MySqlDebeziumAvroPayload(deleteRecord, "00002.11"); + + GenericRecord existingRecord = createRecord(2, Operation.UPDATE, "00001.111"); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + // expect nothing to be committed to table + assertFalse(mergedRecord.isPresent()); + + GenericRecord lateRecord = createRecord(2, Operation.DELETE, "00000.222"); + payload = new 
MySqlDebeziumAvroPayload(lateRecord, "00000.222"); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 2, Operation.UPDATE, "00001.111"); + } + + private GenericRecord createRecord(int primaryKeyValue, Operation op, String seqValue) { + GenericRecord record = new GenericData.Record(avroSchema); + record.put(KEY_FIELD_NAME, primaryKeyValue); + record.put(DebeziumConstants.FLATTENED_OP_COL_NAME, op.op); + record.put(DebeziumConstants.ADDED_SEQ_COL_NAME, seqValue); + return record; + } + + private void validateRecord(Option iRecord, int primaryKeyValue, Operation op, String seqValue) { + IndexedRecord record = iRecord.get(); + assertEquals(primaryKeyValue, (int) record.get(0)); + assertEquals(op.op, record.get(1).toString()); + assertEquals(seqValue, record.get(2).toString()); + } + + private enum Operation { + INSERT("c"), + UPDATE("u"), + DELETE("d"); + + public final String op; + + Operation(String op) { + this.op = op; + } + } +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java new file mode 100644 index 0000000000000..07512b1c36eaa --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/model/debezium/TestPostgresDebeziumAvroPayload.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or mo contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.common.model.debezium; + +import org.apache.hudi.common.util.Option; + +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.util.Utf8; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; + +public class TestPostgresDebeziumAvroPayload { + + private static final String KEY_FIELD_NAME = "Key"; + private Schema avroSchema; + + @BeforeEach + void setUp() { + this.avroSchema = Schema.createRecord(Arrays.asList( + new Schema.Field(KEY_FIELD_NAME, Schema.create(Schema.Type.INT), "", 0), + new Schema.Field(DebeziumConstants.FLATTENED_OP_COL_NAME, Schema.create(Schema.Type.STRING), "", null), + new Schema.Field(DebeziumConstants.FLATTENED_LSN_COL_NAME, Schema.create(Schema.Type.LONG), "", null) + )); + } + + @Test + public void testInsert() throws IOException { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, 100L); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(insertRecord, 100L); + validateRecord(payload.getInsertValue(avroSchema), 0, Operation.INSERT, 100L); + } + + @Test + public void testPreCombine() { + GenericRecord insertRecord = createRecord(0, Operation.INSERT, 120L); + PostgresDebeziumAvroPayload insertPayload = new PostgresDebeziumAvroPayload(insertRecord, 120L); + + GenericRecord updateRecord = createRecord(0, Operation.UPDATE, 99L); + PostgresDebeziumAvroPayload updatePayload = new PostgresDebeziumAvroPayload(updateRecord, 99L); + + GenericRecord deleteRecord = createRecord(0, Operation.DELETE, 111L); + PostgresDebeziumAvroPayload deletePayload = new PostgresDebeziumAvroPayload(deleteRecord, 111L); + + assertEquals(insertPayload, insertPayload.preCombine(updatePayload)); + assertEquals(deletePayload, deletePayload.preCombine(updatePayload)); + assertEquals(insertPayload, deletePayload.preCombine(insertPayload)); + } + + @Test + public void testMergeWithUpdate() throws IOException { + GenericRecord updateRecord = createRecord(1, Operation.UPDATE, 100L); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(updateRecord, 100L); + + GenericRecord existingRecord = createRecord(1, Operation.INSERT, 99L); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.UPDATE, 100L); + + GenericRecord lateRecord = createRecord(1, Operation.UPDATE, 98L); + payload = new PostgresDebeziumAvroPayload(lateRecord, 98L); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 1, Operation.INSERT, 99L); + } + + @Test + public void testMergeWithDelete() throws IOException { + GenericRecord deleteRecord = createRecord(2, Operation.DELETE, 100L); + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(deleteRecord, 100L); + + GenericRecord existingRecord = createRecord(2, Operation.UPDATE, 99L); + Option mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + // expect nothing to be committed to table + assertFalse(mergedRecord.isPresent()); + + 
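+ // A delete arriving with a lower LSN (98) than the record in storage (99) is late data:
+ // the merge keeps the existing record instead of deleting it.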
GenericRecord lateRecord = createRecord(2, Operation.DELETE, 98L); + payload = new PostgresDebeziumAvroPayload(lateRecord, 98L); + mergedRecord = payload.combineAndGetUpdateValue(existingRecord, avroSchema); + validateRecord(mergedRecord, 2, Operation.UPDATE, 99L); + } + + @Test + public void testMergeWithToastedValues() throws IOException { + Schema avroSchema = SchemaBuilder.builder() + .record("test_schema") + .namespace("test_namespace") + .fields() + .name(DebeziumConstants.FLATTENED_LSN_COL_NAME).type().longType().noDefault() + .name("string_col").type().stringType().noDefault() + .name("byte_col").type().bytesType().noDefault() + .name("string_null_col_1").type().nullable().stringType().noDefault() + .name("byte_null_col_1").type().nullable().bytesType().noDefault() + .name("string_null_col_2").type().nullable().stringType().noDefault() + .name("byte_null_col_2").type().nullable().bytesType().noDefault() + .endRecord(); + + GenericRecord oldVal = new GenericData.Record(avroSchema); + oldVal.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, 100L); + oldVal.put("string_col", "valid string value"); + oldVal.put("byte_col", ByteBuffer.wrap("valid byte value".getBytes())); + oldVal.put("string_null_col_1", "valid string value"); + oldVal.put("byte_null_col_1", ByteBuffer.wrap("valid byte value".getBytes())); + oldVal.put("string_null_col_2", null); + oldVal.put("byte_null_col_2", null); + + GenericRecord newVal = new GenericData.Record(avroSchema); + newVal.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, 105L); + newVal.put("string_col", PostgresDebeziumAvroPayload.DEBEZIUM_TOASTED_VALUE); + newVal.put("byte_col", ByteBuffer.wrap(PostgresDebeziumAvroPayload.DEBEZIUM_TOASTED_VALUE.getBytes())); + newVal.put("string_null_col_1", null); + newVal.put("byte_null_col_1", null); + newVal.put("string_null_col_2", "valid string value"); + newVal.put("byte_null_col_2", ByteBuffer.wrap("valid byte value".getBytes())); + + PostgresDebeziumAvroPayload payload = new PostgresDebeziumAvroPayload(Option.of(newVal)); + + GenericRecord outputRecord = (GenericRecord) payload + .combineAndGetUpdateValue(oldVal, avroSchema).get(); + + assertEquals("valid string value", outputRecord.get("string_col")); + assertEquals("valid byte value", new String(((ByteBuffer) outputRecord.get("byte_col")).array(), StandardCharsets.UTF_8)); + assertNull(outputRecord.get("string_null_col_1")); + assertNull(outputRecord.get("byte_null_col_1")); + assertEquals("valid string value", ((Utf8) outputRecord.get("string_null_col_2")).toString()); + assertEquals("valid byte value", new String(((ByteBuffer) outputRecord.get("byte_null_col_2")).array(), StandardCharsets.UTF_8)); + } + + private GenericRecord createRecord(int primaryKeyValue, Operation op, long lsnValue) { + GenericRecord record = new GenericData.Record(avroSchema); + record.put(KEY_FIELD_NAME, primaryKeyValue); + record.put(DebeziumConstants.FLATTENED_OP_COL_NAME, op.op); + record.put(DebeziumConstants.FLATTENED_LSN_COL_NAME, lsnValue); + return record; + } + + private void validateRecord(Option iRecord, int primaryKeyValue, Operation op, long lsnValue) { + IndexedRecord record = iRecord.get(); + assertEquals(primaryKeyValue, (int) record.get(0)); + assertEquals(op.op, record.get(1).toString()); + assertEquals(lsnValue, (long) record.get(2)); + } + + private enum Operation { + INSERT("c"), + UPDATE("u"), + DELETE("d"); + + public final String op; + + Operation(String op) { + this.op = op; + } + } +} diff --git 
a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java index 32ef60967ae8a..216369296ad53 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaRegistryProvider.java @@ -49,7 +49,7 @@ public class SchemaRegistryProvider extends SchemaProvider { */ public static class Config { - private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; + public static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url"; private static final String TARGET_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.targetUrl"; } diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java new file mode 100644 index 0000000000000..7018419c2d6de --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/DebeziumSource.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utilities.sources.debezium; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.avro.Schema; +import org.apache.avro.Schema.Field; +import org.apache.avro.Schema.Type; +import org.apache.avro.generic.GenericRecord; + +import org.apache.hudi.AvroConversionUtils; +import org.apache.hudi.DataSourceWriteOptions; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.Pair; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.schema.SchemaRegistryProvider; +import org.apache.hudi.utilities.sources.RowSource; +import org.apache.hudi.utilities.sources.helpers.AvroConvertor; +import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen; +import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils; + +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.functions; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.streaming.kafka010.KafkaUtils; +import org.apache.spark.streaming.kafka010.LocationStrategies; +import org.apache.spark.streaming.kafka010.OffsetRange; + +/** + * Base class for Debezium streaming source which expects change events as Kafka Avro records. + * Obtains the schema from the confluent schema-registry. + */ +public abstract class DebeziumSource extends RowSource { + + private static final Logger LOG = LogManager.getLogger(DebeziumSource.class); + // these are native kafka's config. do not change the config names. 
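+ // A minimal sketch of the wiring done in the constructor below: the key deserializer is always
+ // pinned to StringDeserializer, while the value deserializer class is read from the
+ // hoodie.deltastreamer.source.kafka.value.deserializer.class property (as exercised in the tests)
+ // and loaded reflectively.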
+ private static final String NATIVE_KAFKA_KEY_DESERIALIZER_PROP = "key.deserializer"; + private static final String NATIVE_KAFKA_VALUE_DESERIALIZER_PROP = "value.deserializer"; + private static final String OVERRIDE_CHECKPOINT_STRING = "hoodie.debezium.override.initial.checkpoint.key"; + private static final String CONNECT_NAME_KEY = "connect.name"; + private static final String DATE_CONNECT_NAME = "custom.debezium.DateString"; + + private final KafkaOffsetGen offsetGen; + private final HoodieDeltaStreamerMetrics metrics; + private final SchemaRegistryProvider schemaRegistryProvider; + private final String deserializerClassName; + + public DebeziumSource(TypedProperties props, JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider, + HoodieDeltaStreamerMetrics metrics) { + super(props, sparkContext, sparkSession, schemaProvider); + + props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class); + deserializerClassName = props.getString(DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().key(), + DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().defaultValue()); + + try { + props.put(NATIVE_KAFKA_VALUE_DESERIALIZER_PROP, Class.forName(deserializerClassName)); + } catch (ClassNotFoundException e) { + String error = "Could not load custom avro kafka deserializer: " + deserializerClassName; + LOG.error(error); + throw new HoodieException(error, e); + } + + // Currently, debezium source requires Confluent/Kafka schema-registry to fetch the latest schema. + if (schemaProvider == null || !(schemaProvider instanceof SchemaRegistryProvider)) { + schemaRegistryProvider = new SchemaRegistryProvider(props, sparkContext); + } else { + schemaRegistryProvider = (SchemaRegistryProvider) schemaProvider; + } + + offsetGen = new KafkaOffsetGen(props); + this.metrics = metrics; + } + + @Override + protected Pair>, String> fetchNextBatch(Option lastCkptStr, long sourceLimit) { + String overrideCheckpointStr = props.getString(OVERRIDE_CHECKPOINT_STRING, ""); + + OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCkptStr, sourceLimit, metrics); + long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges); + LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName()); + + if (totalNewMsgs == 0) { + // If there are no new messages, use empty dataframe with no schema. This is because the schema from schema registry can only be considered + // up to date if a change event has occurred. + return Pair.of(Option.of(sparkSession.emptyDataFrame()), overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr); + } else { + try { + String schemaStr = schemaRegistryProvider.fetchSchemaFromRegistry(props.getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP)); + Dataset dataset = toDataset(offsetRanges, offsetGen, schemaStr); + LOG.info(String.format("Spark schema of Kafka Payload for topic %s:\n%s", offsetGen.getTopicName(), dataset.schema().treeString())); + LOG.info(String.format("New checkpoint string: %s", CheckpointUtils.offsetsToStr(offsetRanges))); + return Pair.of(Option.of(dataset), overrideCheckpointStr.isEmpty() ? 
CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr); + } catch (IOException exc) { + LOG.error("Fatal error reading and parsing incoming debezium event", exc); + throw new HoodieException("Fatal error reading and parsing incoming debezium event", exc); + } + } + } + + /** + * Debezium Kafka Payload has a nested structure, flatten it specific to the Database type. + * @param rawKafkaData Dataset of the Debezium CDC event from the kafka + * @return A flattened dataset. + */ + protected abstract Dataset processDataset(Dataset rawKafkaData); + + /** + * Converts a Kafka Topic offset into a Spark dataset. + * + * @param offsetRanges Offset ranges + * @param offsetGen KafkaOffsetGen + * @return Spark dataset + */ + private Dataset toDataset(OffsetRange[] offsetRanges, KafkaOffsetGen offsetGen, String schemaStr) { + AvroConvertor convertor = new AvroConvertor(schemaStr); + Dataset kafkaData; + if (deserializerClassName.equals(StringDeserializer.class.getName())) { + kafkaData = AvroConversionUtils.createDataFrame( + KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent()) + .map(obj -> convertor.fromJson(obj.value())) + .rdd(), schemaStr, sparkSession); + } else { + kafkaData = AvroConversionUtils.createDataFrame( + KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent()) + .map(obj -> (GenericRecord) obj.value()) + .rdd(), schemaStr, sparkSession); + } + + // Flatten debezium payload, specific to each DB type (postgres/ mysql/ etc..) + Dataset debeziumDataset = processDataset(kafkaData); + + // Some required transformations to ensure debezium data types are converted to spark supported types. + return convertArrayColumnsToString(convertColumnToNullable(sparkSession, + convertDateColumns(debeziumDataset, new Schema.Parser().parse(schemaStr)))); + } + + /** + * Converts string formatted date columns into Spark date columns. + * + * @param dataset Spark dataset + * @param schema Avro schema from Debezium + * @return Converted dataset + */ + public static Dataset convertDateColumns(Dataset dataset, Schema schema) { + if (schema.getField("before") != null) { + List dateFields = schema.getField("before") + .schema() + .getTypes() + .get(1) + .getFields() + .stream() + .filter(field -> { + if (field.schema().getType() == Type.UNION) { + return field.schema().getTypes().stream().anyMatch( + schemaInUnion -> DATE_CONNECT_NAME.equals(schemaInUnion.getProp(CONNECT_NAME_KEY)) + ); + } else { + return DATE_CONNECT_NAME.equals(field.schema().getProp(CONNECT_NAME_KEY)); + } + }).map(Field::name).collect(Collectors.toList()); + + LOG.info("Date fields: " + dateFields.toString()); + + for (String dateCol : dateFields) { + dataset = dataset.withColumn(dateCol, functions.col(dateCol).cast(DataTypes.DateType)); + } + } + + return dataset; + } + + /** + * Utility function for converting columns to nullable. This is useful when required to make a column nullable to match a nullable column from Debezium change + * events. + * + * @param sparkSession SparkSession object + * @param dataset Dataframe to modify + * @return Modified dataframe + */ + private static Dataset convertColumnToNullable(SparkSession sparkSession, Dataset dataset) { + List columns = Arrays.asList(dataset.columns()); + StructField[] modifiedStructFields = Arrays.stream(dataset.schema().fields()).map(field -> columns + .contains(field.name()) ? 
new StructField(field.name(), field.dataType(), true, field.metadata()) : field) + .toArray(StructField[]::new); + + return sparkSession.createDataFrame(dataset.rdd(), new StructType(modifiedStructFields)); + } + + /** + * Converts Array types to String types because not all Debezium array columns are supported to be converted + * to Spark array columns. + * + * @param dataset Dataframe to modify + * @return Modified dataframe + */ + private static Dataset convertArrayColumnsToString(Dataset dataset) { + List arrayColumns = Arrays.stream(dataset.schema().fields()) + .filter(field -> field.dataType().typeName().toLowerCase().startsWith("array")) + .map(StructField::name) + .collect(Collectors.toList()); + + for (String colName : arrayColumns) { + dataset = dataset.withColumn(colName, functions.col(colName).cast(DataTypes.StringType)); + } + + return dataset; + } +} + diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/MysqlDebeziumSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/MysqlDebeziumSource.java new file mode 100644 index 0000000000000..cc8f645cbf83d --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/MysqlDebeziumSource.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources.debezium; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.debezium.DebeziumConstants; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics; +import org.apache.hudi.utilities.schema.SchemaProvider; + +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.api.java.UDF2; +import org.apache.spark.sql.types.DataTypes; + +import static org.apache.spark.sql.functions.callUDF; + +/** + * Source for incrementally ingesting debezium generated change logs for Mysql DB. 
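+ *
+ * In addition to flattening the change events, this source appends an {@code _event_seq}
+ * ordering column derived from the binlog file number and position via the UDF registered below.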
+ */ +public class MysqlDebeziumSource extends DebeziumSource { + + private final SQLContext sqlContext; + private final String generateUniqueSeqUdfFn = "mysql_generate_order_key"; + + public MysqlDebeziumSource(TypedProperties props, JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider, + HoodieDeltaStreamerMetrics metrics) { + super(props, sparkContext, sparkSession, schemaProvider, metrics); + this.sqlContext = sparkSession.sqlContext(); + sqlContext.udf().register(generateUniqueSeqUdfFn, (UDF2) MysqlDebeziumSource::generateUniqueSequence, DataTypes.StringType); + } + + /** + * Debezium Kafka Payload has a nested structure (see https://debezium.io/documentation/reference/1.4/connectors/mysql.html). + * This function flattens this nested structure for the Mysql data, and also extracts a subset of Debezium metadata fields. + * + * @param rowDataset Dataset containing Debezium Payloads + * @return New dataset with flattened columns + */ + @Override + protected Dataset processDataset(Dataset rowDataset) { + Dataset flattenedDataset = rowDataset; + if (rowDataset.columns().length > 0) { + // Only flatten for non-empty schemas + Dataset insertedOrUpdatedData = rowDataset + .selectExpr( + String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_FILE_FIELD, DebeziumConstants.FLATTENED_FILE_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_POS_FIELD, DebeziumConstants.FLATTENED_POS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_ROW_FIELD, DebeziumConstants.FLATTENED_ROW_COL_NAME), + String.format("%s.*", DebeziumConstants.INCOMING_AFTER_FIELD) + ) + .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).notEqual(DebeziumConstants.DELETE_OP)); + + Dataset deletedData = rowDataset + .selectExpr( + String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_FILE_FIELD, DebeziumConstants.FLATTENED_FILE_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_POS_FIELD, DebeziumConstants.FLATTENED_POS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_ROW_FIELD, DebeziumConstants.FLATTENED_ROW_COL_NAME), + String.format("%s.*", DebeziumConstants.INCOMING_BEFORE_FIELD) + ) + .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).equalTo(DebeziumConstants.DELETE_OP)); + + flattenedDataset = insertedOrUpdatedData.union(deletedData); 
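+ // Note: Dataset.union is positional, which is safe here only because both branches
+ // select the same columns in the same order.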
+ } + + return flattenedDataset.withColumn(DebeziumConstants.ADDED_SEQ_COL_NAME, + callUDF(generateUniqueSeqUdfFn, flattenedDataset.col(DebeziumConstants.FLATTENED_FILE_COL_NAME), + flattenedDataset.col(DebeziumConstants.FLATTENED_POS_COL_NAME))); + } + + private static String generateUniqueSequence(String fileId, Long pos) { + return fileId.substring(fileId.lastIndexOf('.') + 1).concat("." + pos); + } +} diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/PostgresDebeziumSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/PostgresDebeziumSource.java new file mode 100644 index 0000000000000..bf4381792cfc7 --- /dev/null +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/debezium/PostgresDebeziumSource.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources.debezium; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.debezium.DebeziumConstants; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics; +import org.apache.hudi.utilities.schema.SchemaProvider; + +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; + +/** + * Source for incrementally ingesting debezium generated change logs for PostgresDB. + */ +public class PostgresDebeziumSource extends DebeziumSource { + + public PostgresDebeziumSource(TypedProperties props, JavaSparkContext sparkContext, + SparkSession sparkSession, + SchemaProvider schemaProvider, + HoodieDeltaStreamerMetrics metrics) { + super(props, sparkContext, sparkSession, schemaProvider, metrics); + } + + /** + * Debezium Kafka Payload has a nested structure (see https://debezium.io/documentation/reference/1.4/connectors/postgresql.html#postgresql-create-events). + * This function flattens this nested structure for the Postgres data, and also extracts a subset of Debezium metadata fields. + * + * @param rowDataset Dataset containing Debezium Payloads + * @return New dataset with flattened columns + */ + @Override + protected Dataset processDataset(Dataset rowDataset) { + if (rowDataset.columns().length > 0) { + // Pick selective debezium and postgres meta fields: pick the row values from before field for delete record + // and row values from after field for insert or update records. 
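+ // e.g. (illustrative) a delete event {op: "d", before: {...}, after: null} keeps its
+ // before-image row values, while inserts and updates keep the after-image row values.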
+ Dataset insertedOrUpdatedData = rowDataset + .selectExpr( + String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_LSN_FIELD, DebeziumConstants.FLATTENED_LSN_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_XMIN_FIELD, DebeziumConstants.FLATTENED_XMIN_COL_NAME), + String.format("%s.*", DebeziumConstants.INCOMING_AFTER_FIELD) + ) + .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).notEqual(DebeziumConstants.DELETE_OP)); + + Dataset deletedData = rowDataset + .selectExpr( + String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_LSN_FIELD, DebeziumConstants.FLATTENED_LSN_COL_NAME), + String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_XMIN_FIELD, DebeziumConstants.FLATTENED_XMIN_COL_NAME), + String.format("%s.*", DebeziumConstants.INCOMING_BEFORE_FIELD) + ) + .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).equalTo(DebeziumConstants.DELETE_OP)); + + return insertedOrUpdatedData.union(deletedData); + } else { + return rowDataset; + } + } +} + diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestAbstractDebeziumSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestAbstractDebeziumSource.java new file mode 100644 index 0000000000000..f7dc6b9258c16 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestAbstractDebeziumSource.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.utilities.sources.debezium; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.debezium.DebeziumConstants; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.utilities.UtilHelpers; +import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics; +import org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter; +import org.apache.hudi.utilities.schema.SchemaProvider; +import org.apache.hudi.utilities.schema.SchemaRegistryProvider; +import org.apache.hudi.utilities.sources.InputBatch; +import org.apache.hudi.utilities.testutils.UtilitiesTestBase; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.streaming.kafka010.KafkaTestUtils; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.UUID; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.params.provider.Arguments.arguments; +import static org.mockito.Mockito.mock; + +public abstract class TestAbstractDebeziumSource extends UtilitiesTestBase { + + private static final String TEST_TOPIC_NAME = "hoodie_test"; + + private final HoodieDeltaStreamerMetrics metrics = mock(HoodieDeltaStreamerMetrics.class); + private KafkaTestUtils testUtils; + + @BeforeAll + public static void initClass() throws Exception { + UtilitiesTestBase.initClass(false); + } + + @AfterAll + public static void cleanupClass() { + UtilitiesTestBase.cleanupClass(); + } + + @BeforeEach + public void setup() throws Exception { + super.setup(); + testUtils = new KafkaTestUtils(); + testUtils.setup(); + } + + @AfterEach + public void teardown() throws Exception { + super.teardown(); + testUtils.teardown(); + } + + private TypedProperties createPropsForJsonSource() { + TypedProperties props = new TypedProperties(); + props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME); + props.setProperty("bootstrap.servers", testUtils.brokerAddress()); + props.setProperty("auto.offset.reset", "earliest"); + props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); + props.setProperty("hoodie.deltastreamer.schemaprovider.registry.url", "localhost"); + props.setProperty("hoodie.deltastreamer.source.kafka.value.deserializer.class", StringDeserializer.class.getName()); + props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); + + return props; + } + + protected abstract String getIndexName(); + + protected abstract String getSourceClass(); + + protected abstract String getSchema(); + + protected abstract GenericRecord generateMetaFields(GenericRecord record); + + protected abstract void validateMetaFields(Dataset records); + + @ParameterizedTest + @MethodSource("testArguments") + public void testDebeziumEvents(Operation operation) throws 
+
+    String sourceClass = getSourceClass();
+
+    // Topic setup.
+    testUtils.createTopic(TEST_TOPIC_NAME, 2);
+    TypedProperties props = createPropsForJsonSource();
+
+    SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
+    SourceFormatAdapter debeziumSource = new SourceFormatAdapter(UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));
+
+    testUtils.sendMessages(TEST_TOPIC_NAME, new String[] {generateDebeziumEvent(operation).toString()});
+
+    InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
+    assertEquals(1, fetch.getBatch().get().count());
+
+    // Ensure the before fields are picked for DELETE CDC Events,
+    // and after fields are picked for INSERT and UPDATE CDC Events.
+    final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
+    assertTrue(fetch.getBatch().get().select("type").collectAsList().stream()
+        .allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
+    assertTrue(fetch.getBatch().get().select("payload").collectAsList().stream()
+        .allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
+
+    // Validate DB specific meta fields
+    validateMetaFields(fetch.getBatch().get());
+  }
+
+  private GenericRecord generateDebeziumEvent(Operation op) {
+    Schema schema = new Schema.Parser().parse(getSchema());
+    String indexName = getIndexName().concat(".ghschema.gharchive.Value");
+    GenericRecord rec = new GenericData.Record(schema);
+    rec.put(DebeziumConstants.INCOMING_OP_FIELD, op.op);
+    rec.put(DebeziumConstants.INCOMING_TS_MS_FIELD, 100L);
+
+    // Before image: the state of the row prior to the change.
+    Schema.Field beforeField = schema.getField(DebeziumConstants.INCOMING_BEFORE_FIELD);
+    Schema beforeSchema = beforeField.schema().getTypes().get(beforeField.schema().getIndexNamed(indexName));
+    GenericRecord beforeRecord = new GenericData.Record(beforeSchema);
+
+    beforeRecord.put("id", 1);
+    beforeRecord.put("date", "1/1/2020");
+    beforeRecord.put("type", "before_type");
+    beforeRecord.put("payload", "before_payload");
+    beforeRecord.put("timestamp", 1000L);
+    rec.put(DebeziumConstants.INCOMING_BEFORE_FIELD, beforeRecord);
+
+    // After image: the state of the row following the change.
+    Schema.Field afterField = schema.getField(DebeziumConstants.INCOMING_AFTER_FIELD);
+    Schema afterSchema = afterField.schema().getTypes().get(afterField.schema().getIndexNamed(indexName));
+    GenericRecord afterRecord = new GenericData.Record(afterSchema);
+
+    afterRecord.put("id", 1);
+    afterRecord.put("date", "1/1/2021");
+    afterRecord.put("type", "after_type");
+    afterRecord.put("payload", "after_payload");
+    afterRecord.put("timestamp", 3000L);
+    rec.put(DebeziumConstants.INCOMING_AFTER_FIELD, afterRecord);
+
+    return generateMetaFields(rec);
+  }
+
+  /**
+   * Serves the connector schema directly instead of querying a live schema registry.
+   */
+  private static class MockSchemaRegistryProvider extends SchemaRegistryProvider {
+
+    private final String schema;
+
+    public MockSchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc, TestAbstractDebeziumSource source) {
+      super(props, jssc);
+      schema = source.getSchema();
+    }
+
+    @Override
+    public String fetchSchemaFromRegistry(String registryUrl) throws IOException {
+      return schema;
+    }
+  }
+
+  private static Stream<Arguments> testArguments() {
+    return Stream.of(
+        arguments(Operation.INSERT),
+        arguments(Operation.UPDATE),
+        arguments(Operation.DELETE)
+    );
+  }
+
+  private enum Operation {
+    INSERT("c"),
+    UPDATE("u"),
+    DELETE("d");
+
+    public final String op;
+
+    Operation(String op) {
+      this.op = op;
+    }
+  }
+}
diff --git 
a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestMysqlDebeziumSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestMysqlDebeziumSource.java new file mode 100644 index 0000000000000..0886314f03697 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestMysqlDebeziumSource.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources.debezium; + +import org.apache.hudi.common.model.debezium.DebeziumConstants; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestMysqlDebeziumSource extends TestAbstractDebeziumSource { + + private static final String MYSQL_GITHUB_SCHEMA = "{\"connect.name\": \"mysql.ghschema.gharchive.Envelope\",\n" + + " \"fields\": [{\"default\": null,\"name\": \"before\",\"type\": [\"null\",{\"connect.name\": \"mysql.ghschema.gharchive.Value\",\n" + + " \"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"date\",\"type\": \"string\"},{\"default\": null,\"name\": \"timestamp\",\n" + + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"type\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"payload\",\n" + + " \"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"org\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"created_at\",\n" + + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"public\",\"type\": [\"null\",\"boolean\"]}],\"name\": \"Value\",\"type\": \"record\"\n" + + " }]},{\"default\": null,\"name\": \"after\",\"type\": [\"null\",\"Value\"]},{\"name\": \"source\",\"type\": {\"connect.name\": \"io.debezium.connector.mysql.Source\",\n" + + " \"fields\": [{\"name\": \"connector\",\"type\": \"string\"},{\"name\": \"name\",\"type\": \"string\"},{\"name\": \"ts_ms\",\"type\": \"long\"},\n" + + " {\"name\": \"db\",\"type\": \"string\"},{\"name\": \"table\",\"type\": \"string\"},{\"default\": null,\n" + + " \"name\": \"txId\",\"type\": [\"null\",\"long\"]},{\"name\": \"file\",\"type\": \"string\"},{\"default\": null,\"name\": \"pos\",\"type\": [\"null\",\"long\"]},{\"default\": null,\n" + + " \"name\": \"row\",\"type\": [\"null\",\"long\"]}],\"name\": \"Source\",\"namespace\": \"io.debezium.connector.mysql\",\"type\": \"record\"\n" + + " }},{\"name\": \"op\",\"type\": \"string\"},{\"default\": null,\"name\": \"ts_ms\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"transaction\",\n" + + " \"type\": [\"null\",{\"fields\": [{\"name\": 
\"id\",\"type\": \"string\"},{\"name\": \"total_order\",\"type\": \"long\"},{\"name\": \"data_collection_order\",\n" + + " \"type\": \"long\"}],\"name\": \"ConnectDefault\",\"namespace\": \"io.confluent.connect.avro\",\"type\": \"record\"}]}],\"name\": \"Envelope\",\n" + + " \"namespace\": \"mysql.ghschema.gharchive\",\"type\": \"record\"}"; + + private static final String TEST_DB = "ghschema"; + private static final String TEST_TABLE = "gharchive"; + private static final long TEST_TS_MS = 12345L; + private static final long TEST_TXID = 543L; + private static final String TEST_FILE = "mysql-bin.00007"; + private static final long TEST_POS = 98765L; + private static final String EXPECTED_TEST_SEQ = "00007.98765"; + + @Override + protected String getIndexName() { + return "mysql"; + } + + @Override + protected String getSourceClass() { + return MysqlDebeziumSource.class.getName(); + } + + @Override + protected String getSchema() { + return MYSQL_GITHUB_SCHEMA; + } + + @Override + protected GenericRecord generateMetaFields(GenericRecord rec) { + Schema schema = new Schema.Parser().parse(getSchema()); + // Source fields specific to Mysql DB + GenericRecord sourceRecord = new GenericData.Record(schema.getField(DebeziumConstants.INCOMING_SOURCE_FIELD).schema()); + sourceRecord.put("name", getIndexName()); + sourceRecord.put("connector", getIndexName()); + sourceRecord.put("db", TEST_DB); + sourceRecord.put("table", TEST_TABLE); + sourceRecord.put("ts_ms", TEST_TS_MS); + sourceRecord.put("txId", TEST_TXID); + sourceRecord.put("file", TEST_FILE); + sourceRecord.put("pos", TEST_POS); + rec.put(DebeziumConstants.INCOMING_SOURCE_FIELD, sourceRecord); + return rec; + } + + @Override + protected void validateMetaFields(Dataset records) { + assertTrue(records.select(DebeziumConstants.FLATTENED_SHARD_NAME).collectAsList().stream() + .allMatch(r -> r.getString(0).equals(getIndexName()))); + assertTrue(records.select(DebeziumConstants.FLATTENED_TS_COL_NAME).collectAsList().stream() + .allMatch(r -> r.getLong(0) == TEST_TS_MS)); + assertTrue(records.select(DebeziumConstants.FLATTENED_TX_ID_COL_NAME).collectAsList().stream() + .allMatch(r -> r.getLong(0) == TEST_TXID)); + assertTrue(records.select(DebeziumConstants.ADDED_SEQ_COL_NAME).collectAsList().stream() + .allMatch(r -> r.getString(0).equals(EXPECTED_TEST_SEQ))); + } +} diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestPostgresDebeziumSource.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestPostgresDebeziumSource.java new file mode 100644 index 0000000000000..ef75fc61ff0f1 --- /dev/null +++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/sources/debezium/TestPostgresDebeziumSource.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.utilities.sources.debezium; + +import org.apache.hudi.common.model.debezium.DebeziumConstants; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestPostgresDebeziumSource extends TestAbstractDebeziumSource { + + private static final String POSTGRES_GITHUB_SCHEMA = "{\"connect.name\": \"postgres.ghschema.gharchive.Envelope\",\n" + + " \"fields\": [{\"default\": null,\"name\": \"before\",\"type\": [\"null\",{\"connect.name\": \"postgres.ghschema.gharchive.Value\",\n" + + " \"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"date\",\"type\": \"string\"},{\"default\": null,\"name\": \"timestamp\",\n" + + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"type\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"payload\",\n" + + " \"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"org\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"created_at\",\n" + + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"public\",\"type\": [\"null\",\"boolean\"]}],\"name\": \"Value\",\"type\": \"record\"\n" + + " }]},{\"default\": null,\"name\": \"after\",\"type\": [\"null\",\"Value\"]},{\"name\": \"source\",\"type\": {\"connect.name\": \"io.debezium.connector.postgresql.Source\",\n" + + " \"fields\": [{\"name\": \"connector\",\"type\": \"string\"},{\"name\": \"name\",\"type\": \"string\"},{\"name\": \"ts_ms\",\"type\": \"long\"},\n" + + " {\"name\": \"db\",\"type\": \"string\"},{\"name\": \"schema\",\"type\": \"string\"},{\"name\": \"table\",\"type\": \"string\"},{\"default\": null,\n" + + " \"name\": \"txId\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"lsn\",\"type\": [\"null\",\"long\"]},{\"default\": null,\n" + + " \"name\": \"xmin\",\"type\": [\"null\",\"long\"]}],\"name\": \"Source\",\"namespace\": \"io.debezium.connector.postgresql\",\"type\": \"record\"\n" + + " }},{\"name\": \"op\",\"type\": \"string\"},{\"default\": null,\"name\": \"ts_ms\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"transaction\",\n" + + " \"type\": [\"null\",{\"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"total_order\",\"type\": \"long\"},{\"name\": \"data_collection_order\",\n" + + " \"type\": \"long\"}],\"name\": \"ConnectDefault\",\"namespace\": \"io.confluent.connect.avro\",\"type\": \"record\"}]}],\"name\": \"Envelope\",\n" + + " \"namespace\": \"postgres.ghschema.gharchive\",\"type\": \"record\"}"; + + private static final String TEST_DB = "postgres"; + private static final String TEST_SCHEMA = "ghschema"; + private static final String TEST_TABLE = "gharchive"; + private static final long TEST_TS_MS = 12345L; + private static final long TEST_TXID = 543L; + private static final long TEST_LSN = 98765L; + + @Override + protected String getIndexName() { + return "postgres"; + } + + @Override + protected String getSourceClass() { + return PostgresDebeziumSource.class.getName(); + } + + @Override + protected String getSchema() { + return POSTGRES_GITHUB_SCHEMA; + } + + @Override + protected GenericRecord generateMetaFields(GenericRecord rec) { + Schema schema = new Schema.Parser().parse(getSchema()); + // Source fields specific to 
PostgreSQL
+    GenericRecord sourceRecord = new GenericData.Record(schema.getField(DebeziumConstants.INCOMING_SOURCE_FIELD).schema());
+    sourceRecord.put("name", getIndexName());
+    sourceRecord.put("connector", getIndexName());
+    sourceRecord.put("db", TEST_DB);
+    sourceRecord.put("schema", TEST_SCHEMA);
+    sourceRecord.put("table", TEST_TABLE);
+    sourceRecord.put("ts_ms", TEST_TS_MS);
+    sourceRecord.put("txId", TEST_TXID);
+    sourceRecord.put("lsn", TEST_LSN);
+    rec.put(DebeziumConstants.INCOMING_SOURCE_FIELD, sourceRecord);
+    return rec;
+  }
+
+  @Override
+  protected void validateMetaFields(Dataset<Row> records) {
+    assertTrue(records.select(DebeziumConstants.FLATTENED_TS_COL_NAME).collectAsList().stream()
+        .allMatch(r -> r.getLong(0) == TEST_TS_MS));
+    assertTrue(records.select(DebeziumConstants.FLATTENED_TX_ID_COL_NAME).collectAsList().stream()
+        .allMatch(r -> r.getLong(0) == TEST_TXID));
+    assertTrue(records.select(DebeziumConstants.FLATTENED_LSN_COL_NAME).collectAsList().stream()
+        .allMatch(r -> r.getLong(0) == TEST_LSN));
+  }
+}
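
A note for reviewers who want to exercise the new sources outside the JUnit harness: the sketch below wires one up by hand, reusing only calls that already appear in the tests above (SchemaRegistryProvider, UtilHelpers.createSource, SourceFormatAdapter, fetchNewDataInRowFormat). The topic name, broker address, and registry URL are placeholders, and jsc, sparkSession, and metrics are assumed to be initialized the way UtilitiesTestBase does; treat this as a sketch, not part of the patch.

    // Sketch only: placeholder endpoints; jsc, sparkSession and metrics assumed in scope.
    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.deltastreamer.source.kafka.topic", "pg.public.orders");  // placeholder topic
    props.setProperty("bootstrap.servers", "localhost:9092");                          // placeholder broker
    props.setProperty("hoodie.deltastreamer.schemaprovider.registry.url",
        "http://localhost:8081/subjects/pg.public.orders-value/versions/latest");      // placeholder registry URL

    // Resolve the Debezium envelope schema from the registry and build the source.
    SchemaProvider schemaProvider = new SchemaRegistryProvider(props, jsc);
    SourceFormatAdapter adapter = new SourceFormatAdapter(
        UtilHelpers.createSource(PostgresDebeziumSource.class.getName(), props, jsc, sparkSession, schemaProvider, metrics));

    // Rows come back flattened exactly as in the selectExpr/union logic above: the op,
    // source metadata, and the appropriate before/after image become top-level columns.
    InputBatch<Dataset<Row>> batch = adapter.fetchNewDataInRowFormat(Option.empty(), 1000);
    if (batch.getBatch().isPresent()) {
      batch.getBatch().get().show(10, false);
    }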