diff --git a/hudi-utilities/pom.xml b/hudi-utilities/pom.xml
index c8a58d5a588b9..5ef04650a6c99 100644
--- a/hudi-utilities/pom.xml
+++ b/hudi-utilities/pom.xml
@@ -402,6 +402,14 @@
test
+
+
+    <dependency>
+      <groupId>com.amazonaws</groupId>
+      <artifactId>aws-java-sdk-sqs</artifactId>
+      <version>${aws.sdk.version}</version>
+    </dependency>
+
${hive.groupid}
diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java
index dd841f4276042..a217e6b7a8009 100644
--- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java
+++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/sources/HoodieIncrSource.java
@@ -42,45 +42,51 @@ public class HoodieIncrSource extends RowSource {
private static final Logger LOG = LogManager.getLogger(HoodieIncrSource.class);
- protected static class Config {
+ static class Config {
/**
* {@value #HOODIE_SRC_BASE_PATH} is the base-path for the source Hoodie table.
*/
- private static final String HOODIE_SRC_BASE_PATH = "hoodie.deltastreamer.source.hoodieincr.path";
+ static final String HOODIE_SRC_BASE_PATH = "hoodie.deltastreamer.source.hoodieincr.path";
/**
* {@value #NUM_INSTANTS_PER_FETCH} allows the max number of instants whose changes can be incrementally fetched.
*/
- private static final String NUM_INSTANTS_PER_FETCH = "hoodie.deltastreamer.source.hoodieincr.num_instants";
- private static final Integer DEFAULT_NUM_INSTANTS_PER_FETCH = 1;
+ static final String NUM_INSTANTS_PER_FETCH = "hoodie.deltastreamer.source.hoodieincr.num_instants";
+ static final Integer DEFAULT_NUM_INSTANTS_PER_FETCH = 1;
/**
* {@value #HOODIE_SRC_PARTITION_FIELDS} specifies partition fields that needs to be added to source table after
* parsing _hoodie_partition_path.
*/
- private static final String HOODIE_SRC_PARTITION_FIELDS = "hoodie.deltastreamer.source.hoodieincr.partition.fields";
+ static final String HOODIE_SRC_PARTITION_FIELDS = "hoodie.deltastreamer.source.hoodieincr.partition.fields";
/**
* {@value #HOODIE_SRC_PARTITION_EXTRACTORCLASS} PartitionValueExtractor class to extract partition fields from
* _hoodie_partition_path.
*/
- private static final String HOODIE_SRC_PARTITION_EXTRACTORCLASS =
+ static final String HOODIE_SRC_PARTITION_EXTRACTORCLASS =
"hoodie.deltastreamer.source.hoodieincr.partition.extractor.class";
- private static final String DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS =
+ static final String DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS =
SlashEncodedDayPartitionValueExtractor.class.getCanonicalName();
/**
* {@value #READ_LATEST_INSTANT_ON_MISSING_CKPT} allows delta-streamer to incrementally fetch from latest committed
* instant when checkpoint is not provided.
*/
- private static final String READ_LATEST_INSTANT_ON_MISSING_CKPT =
+ static final String READ_LATEST_INSTANT_ON_MISSING_CKPT =
"hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt";
- private static final Boolean DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT = false;
+ static final Boolean DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT = false;
+
+ /**
+ * {@value #SOURCE_FILE_FORMAT} is the file format passed to the reader when loading the dataset. Default value is parquet.
+ */
+ static final String SOURCE_FILE_FORMAT = "hoodie.deltastreamer.source.hoodieincr.file.format";
+ static final String DEFAULT_SOURCE_FILE_FORMAT = "parquet";
}
public HoodieIncrSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
- SchemaProvider schemaProvider) {
+ SchemaProvider schemaProvider) {
super(props, sparkContext, sparkSession, schemaProvider);
}
@@ -123,10 +129,10 @@ public Pair