-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-5739] Review Hudi Config Defaults #7912
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
337ab69
85d0754
2b6ac18
a5e96b5
5a345a0
d0be686
debd0a9
4a1bae3
0dbcb68
e047329
ca8557f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,10 +40,13 @@ | |
| + " between writers to a Hudi table. Concurrency between Hudi's own table services " | ||
| + " are auto managed internally.") | ||
| public class DynamoDbBasedLockConfig extends HoodieConfig { | ||
| // Config for DynamoDB based lock provider | ||
|
|
||
| // configs for DynamoDb based locks | ||
| public static final String DYNAMODB_BASED_LOCK_PROPERTY_PREFIX = LOCK_PREFIX + "dynamodb."; | ||
|
|
||
| // Ethan: Can we provide a default value here that is less likely to collide, | ||
| // So this config also becomes optional? | ||
| public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_NAME = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table") | ||
| .noDefaultValue() | ||
|
|
@@ -78,6 +81,7 @@ public class DynamoDbBasedLockConfig extends HoodieConfig { | |
| .withDocumentation("For DynamoDB based lock provider, the region used in endpoint for Amazon DynamoDB service." | ||
| + " Would try to first get it from AWS_REGION environment variable. If not find, by default use us-east-1"); | ||
|
|
||
| // Ethan: Need to confirm these are good defaults for cost | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is fine.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sg |
||
| public static final ConfigProperty<String> DYNAMODB_LOCK_BILLING_MODE = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "billing_mode") | ||
| .defaultValue(BillingMode.PAY_PER_REQUEST.name()) | ||
|
|
@@ -96,16 +100,18 @@ public class DynamoDbBasedLockConfig extends HoodieConfig { | |
| .sinceVersion("0.10.0") | ||
| .withDocumentation("For DynamoDB based lock provider, write capacity units when using PROVISIONED billing mode"); | ||
|
|
||
| // Ethan: 10 min. Is this timeout too long? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Shorten to 2?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5782 |
||
| public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table_creation_timeout") | ||
| .defaultValue(String.valueOf(10 * 60 * 1000)) | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating DynamoDB table"); | ||
|
|
||
| // Ethan: Move the infer logic here | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 for sane default
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| public static final ConfigProperty<String> DYNAMODB_ENDPOINT_URL = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "endpoint_url") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.1") | ||
| .withDocumentation("For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service." | ||
| + " Useful for development with a local dynamodb instance."); | ||
| + " Useful for development with a local dynamodb instance."); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,11 +23,12 @@ | |
| import org.apache.hudi.common.config.ConfigProperty; | ||
| import org.apache.hudi.common.config.HoodieConfig; | ||
|
|
||
| import javax.annotation.concurrent.Immutable; | ||
|
|
||
| import java.io.File; | ||
| import java.io.FileReader; | ||
| import java.io.IOException; | ||
| import java.util.Properties; | ||
| import javax.annotation.concurrent.Immutable; | ||
|
|
||
| import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_BILLING_MODE; | ||
| import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY; | ||
|
|
@@ -41,20 +42,23 @@ | |
| */ | ||
| @Immutable | ||
| @ConfigClassProperty(name = "Amazon Web Services Configs", | ||
| groupName = ConfigGroups.Names.AWS, | ||
| description = "Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks)," | ||
| + " Amazon CloudWatch (metrics).") | ||
| groupName = ConfigGroups.Names.AWS, | ||
| description = "Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks)," | ||
| + " Amazon CloudWatch (metrics).") | ||
| public class HoodieAWSConfig extends HoodieConfig { | ||
| // AWS Access Configs | ||
|
|
||
| // Ethan: These configs should only be necessary if no environmental variables are not set (to confirm with code) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| public static final ConfigProperty<String> AWS_ACCESS_KEY = ConfigProperty | ||
| .key("hoodie.aws.access.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("AWS access key id"); | ||
| .key("hoodie.aws.access.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("AWS access key id"); | ||
|
|
||
| public static final ConfigProperty<String> AWS_SECRET_KEY = ConfigProperty | ||
| .key("hoodie.aws.secret.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .key("hoodie.aws.secret.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("AWS secret key"); | ||
|
|
||
| public static final ConfigProperty<String> AWS_SESSION_TOKEN = ConfigProperty | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,14 +38,17 @@ | |
| groupName = ConfigGroups.Names.WRITE_CLIENT, | ||
| description = "Configurations that control archival.") | ||
| public class HoodieArchivalConfig extends HoodieConfig { | ||
| //cfg | ||
|
|
||
| // Ethan: Should we remove this? Archive should always be enabled? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this become an unsafe/advanced config instead?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5893 |
||
| public static final ConfigProperty<String> AUTO_ARCHIVE = ConfigProperty | ||
| .key("hoodie.archive.automatic") | ||
| .defaultValue("true") | ||
| .withDocumentation("When enabled, the archival table service is invoked immediately after each commit," | ||
| + " to archive commits if we cross a maximum value of commits." | ||
| + " It's recommended to enable this, to ensure number of active commits is bounded."); | ||
|
|
||
| // Ethan: Is Async archive tested? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should ensure this is safe; for now, advanced config.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Mark for table service reliability efforts.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Essential/advanced config tracked in HUDI-5893. |
||
| public static final ConfigProperty<String> ASYNC_ARCHIVE = ConfigProperty | ||
| .key("hoodie.archive.async") | ||
| .defaultValue("false") | ||
|
|
@@ -60,11 +63,13 @@ public class HoodieArchivalConfig extends HoodieConfig { | |
| + " keep the metadata overhead constant, even as the table size grows." | ||
| + " This config controls the maximum number of instants to retain in the active timeline. "); | ||
|
|
||
| // Ethan: Can we use Spark parallelism or auto-derive the configs and remove this? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know if there is a way to do it more intelligently, workload-based, i.e., based on number of instants deleted or something?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To be honest, I don't know if this is a real problem to go after.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5894 |
||
| public static final ConfigProperty<Integer> DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE = ConfigProperty | ||
| .key("hoodie.archive.delete.parallelism") | ||
| .defaultValue(100) | ||
| .withDocumentation("Parallelism for deleting archived hoodie commits."); | ||
|
|
||
| // Ethan: pick one of "hoodie.keep.min.commits" and "hoodie.keep.max.commits" only? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. min/max was added to batch up the archived files. We should leave this alone, IMO.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree. We can leave this as is. |
||
| public static final ConfigProperty<String> MIN_COMMITS_TO_KEEP = ConfigProperty | ||
| .key("hoodie.keep.min.commits") | ||
| .defaultValue("20") | ||
|
|
@@ -93,6 +98,7 @@ public class HoodieArchivalConfig extends HoodieConfig { | |
| .withDocumentation("When enable, hoodie will auto merge several small archive files into larger one. It's" | ||
| + " useful when storage scheme doesn't support append operation."); | ||
|
|
||
| // Ethan: Once this is well tested, we should remove the feature flag | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @nsivabalan could we remove this config now? |
||
| public static final ConfigProperty<Boolean> ARCHIVE_BEYOND_SAVEPOINT = ConfigProperty | ||
| .key("hoodie.archive.beyond.savepoint") | ||
| .defaultValue(false) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,6 +46,7 @@ | |
| + "The bootstrap operation can flexibly avoid copying data over before you can use Hudi and support running the existing " | ||
| + " writers and new hudi writers in parallel, to validate the migration.") | ||
| public class HoodieBootstrapConfig extends HoodieConfig { | ||
| // Configs for Bootstrap feature | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A lot of them may overlap with write configs. Good low-hanging fruits here.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5895 |
||
|
|
||
| public static final ConfigProperty<String> BASE_PATH = ConfigProperty | ||
| .key("hoodie.bootstrap.base.path") | ||
|
|
@@ -74,24 +75,28 @@ public class HoodieBootstrapConfig extends HoodieConfig { | |
| .sinceVersion("0.6.0") | ||
| .withDocumentation("Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD"); | ||
|
|
||
| // Ethan: can this be aligned with key gen for write (no need for a separate bootstrap key gen config)? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 sane defaults
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5895 |
||
| public static final ConfigProperty<String> KEYGEN_CLASS_NAME = ConfigProperty | ||
| .key("hoodie.bootstrap.keygen.class") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.6.0") | ||
| .withDocumentation("Key generator implementation to be used for generating keys from the bootstrapped dataset"); | ||
|
|
||
| // Ethan: similar here | ||
| public static final ConfigProperty<String> KEYGEN_TYPE = ConfigProperty | ||
| .key("hoodie.bootstrap.keygen.type") | ||
| .defaultValue(KeyGeneratorType.SIMPLE.name()) | ||
| .sinceVersion("0.9.0") | ||
| .withDocumentation("Type of build-in key generator, currently support SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE"); | ||
|
|
||
| // Ethan: better default translator: if seeing hive-style partitioning, automatically parse the partition value? | ||
| public static final ConfigProperty<String> PARTITION_PATH_TRANSLATOR_CLASS_NAME = ConfigProperty | ||
| .key("hoodie.bootstrap.partitionpath.translator.class") | ||
| .defaultValue(IdentityBootstrapPartitionPathTranslator.class.getName()) | ||
| .sinceVersion("0.6.0") | ||
| .withDocumentation("Translates the partition paths from the bootstrapped data into how is laid out as a Hudi table."); | ||
|
|
||
| // Ethan: can this be aligned with Spark parallelism? | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5894 |
||
| public static final ConfigProperty<String> PARALLELISM_VALUE = ConfigProperty | ||
| .key("hoodie.bootstrap.parallelism") | ||
| .defaultValue("1500") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ | |
| import org.apache.hudi.table.action.clean.CleaningTriggerStrategy; | ||
|
|
||
| import javax.annotation.concurrent.Immutable; | ||
|
|
||
| import java.io.File; | ||
| import java.io.FileReader; | ||
| import java.io.IOException; | ||
|
|
@@ -41,10 +42,12 @@ | |
| */ | ||
| @Immutable | ||
| @ConfigClassProperty(name = "Clean Configs", | ||
| groupName = ConfigGroups.Names.WRITE_CLIENT, | ||
| description = "Cleaning (reclamation of older/unused file groups/slices).") | ||
| groupName = ConfigGroups.Names.WRITE_CLIENT, | ||
| description = "Cleaning (reclamation of older/unused file groups/slices).") | ||
| public class HoodieCleanConfig extends HoodieConfig { | ||
| // Configs for Clean action | ||
|
|
||
| // Ethan: should we standardize on `.enable` instead of `.automatic`? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 as long as it's backwards compatible.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| public static final ConfigProperty<String> AUTO_CLEAN = ConfigProperty | ||
| .key("hoodie.clean.automatic") | ||
| .defaultValue("true") | ||
|
|
@@ -58,12 +61,16 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| .withDocumentation("Only applies when " + AUTO_CLEAN.key() + " is turned on. " | ||
| + "When turned on runs cleaner async with writing, which can speed up overall write performance."); | ||
|
|
||
| // Ethan: when either "hoodie.cleaner.commits.retained", "hoodie.cleaner.hours.retained", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So, we infer the cleaning policy and throw errors if multiple of them are set?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes. Tracked in HUDI-5892. |
||
| // or "hoodie.cleaner.fileversions.retained" is set, should we automatically use the | ||
| // corresponding clean policy? | ||
| public static final ConfigProperty<String> CLEANER_COMMITS_RETAINED = ConfigProperty | ||
| .key("hoodie.cleaner.commits.retained") | ||
| .defaultValue("10") | ||
| .withDocumentation("Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits " | ||
| + "(scheduled). This also directly translates into how much data retention the table supports for incremental queries."); | ||
|
|
||
| // Ethan: same here | ||
| public static final ConfigProperty<String> CLEANER_HOURS_RETAINED = ConfigProperty.key("hoodie.cleaner.hours.retained") | ||
| .defaultValue("24") | ||
| .withDocumentation("Number of hours for which commits need to be retained. This config provides a more flexible option as" | ||
|
|
@@ -84,11 +91,13 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| .withDocumentation("Controls how cleaning is scheduled. Valid options: " | ||
| + Arrays.stream(CleaningTriggerStrategy.values()).map(Enum::name).collect(Collectors.joining(","))); | ||
|
|
||
| // Ethan: "hoodie.clean.trigger.max.commits" is what this means | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 again on aligning with how we talk about scheduling and executing table services.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| public static final ConfigProperty<String> CLEAN_MAX_COMMITS = ConfigProperty | ||
| .key("hoodie.clean.max.commits") | ||
| .defaultValue("1") | ||
| .withDocumentation("Number of commits after the last clean operation, before scheduling of a new clean is attempted."); | ||
|
|
||
| // Ethan: same here | ||
| public static final ConfigProperty<String> CLEANER_FILE_VERSIONS_RETAINED = ConfigProperty | ||
| .key("hoodie.cleaner.fileversions.retained") | ||
| .defaultValue("3") | ||
|
|
@@ -102,6 +111,7 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| + " in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full" | ||
| + " table for each planning (even with a metadata table)."); | ||
|
|
||
| // Ethan: this should be an internal config, based on concurrency control mode. Should not be tweaked by user. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make this advanced and move on.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5893 |
||
| public static final ConfigProperty<String> FAILED_WRITES_CLEANER_POLICY = ConfigProperty | ||
| .key("hoodie.cleaner.policy.failed.writes") | ||
| .defaultValue(HoodieFailedWritesCleaningPolicy.EAGER.name()) | ||
|
|
@@ -117,18 +127,21 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| + "failed writes to re-claim space. Choose to perform this rollback of failed writes eagerly before " | ||
| + "every writer starts (only supported for single writer) or lazily by the cleaner (required for multi-writers)"); | ||
|
|
||
| // Ethan: same here. Can we reuse Spark parallelism or derive automatically? | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5894 |
||
| public static final ConfigProperty<String> CLEANER_PARALLELISM_VALUE = ConfigProperty | ||
| .key("hoodie.cleaner.parallelism") | ||
| .defaultValue("200") | ||
| .withDocumentation("Parallelism for the cleaning operation. Increase this if cleaning becomes slow."); | ||
|
|
||
| // Ethan: wondering if this is still needed. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @nsivabalan do you think we still need this? |
||
| public static final ConfigProperty<Boolean> ALLOW_MULTIPLE_CLEANS = ConfigProperty | ||
| .key("hoodie.clean.allow.multiple") | ||
| .defaultValue(true) | ||
| .sinceVersion("0.11.0") | ||
| .withDocumentation("Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests should be mutually exclusive, " | ||
| + ".i.e. a 2nd clean will not be scheduled if another clean is not yet completed to avoid repeat cleaning of same files, they might want to disable this config."); | ||
|
|
||
| // Ethan: should the default behavior be deleting original parquet files and then we can remove this config? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should ask Udit why this is the default.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5895 |
||
| public static final ConfigProperty<String> CLEANER_BOOTSTRAP_BASE_FILE_ENABLE = ConfigProperty | ||
| .key("hoodie.cleaner.delete.bootstrap.base.file") | ||
| .defaultValue("false") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,6 +47,7 @@ | |
| description = "Configurations that control the clustering table service in hudi, " | ||
| + "which optimizes the storage layout for better query performance by sorting and sizing data files.") | ||
| public class HoodieClusteringConfig extends HoodieConfig { | ||
| // Configs for Clustering operations | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Revisit all docs here, and make them very user-friendly.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5897 |
||
|
|
||
| // Any strategy specific params can be saved with this prefix | ||
| public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy."; | ||
|
|
@@ -90,6 +91,7 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("End partition used to filter partition (inclusive), only effective when the filter mode '" | ||
| + PLAN_PARTITION_FILTER_MODE + "' is " + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()); | ||
|
|
||
| // Ethan: Should this be aligned with default parquet base file size? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We could infer this from base file size.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would read the code, though, to understand if it has side effects — i.e., Hudi will always leave 100MB files, so will clustering ever kick in? We need to trigger clustering based on col stats and how clustered the table is. |
||
| public static final ConfigProperty<String> PLAN_STRATEGY_SMALL_FILE_LIMIT = ConfigProperty | ||
| .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit") | ||
| .defaultValue(String.valueOf(300 * 1024 * 1024L)) | ||
|
|
@@ -108,6 +110,7 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .sinceVersion("0.11.0") | ||
| .withDocumentation("Partitions to run clustering"); | ||
|
|
||
| // Ethan: enum instead of class name to determine strategy type? So this can be engine-agnostic | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| public static final ConfigProperty<String> PLAN_STRATEGY_CLASS_NAME = ConfigProperty | ||
| .key("hoodie.clustering.plan.strategy.class") | ||
| .defaultValue(SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY) | ||
|
|
@@ -131,18 +134,21 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete") | ||
| .withAlternatives("hoodie.datasource.clustering.inline.enable"); | ||
|
|
||
| // Ethan: this can apply to both inline or aysnc clustering. Rename it for better readability. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| public static final ConfigProperty<String> INLINE_CLUSTERING_MAX_COMMITS = ConfigProperty | ||
| .key("hoodie.clustering.inline.max.commits") | ||
| .defaultValue("4") | ||
| .sinceVersion("0.7.0") | ||
| .withDocumentation("Config to control frequency of clustering planning"); | ||
|
|
||
| // Ethan: should be merged with inline one | ||
| public static final ConfigProperty<String> ASYNC_CLUSTERING_MAX_COMMITS = ConfigProperty | ||
| .key("hoodie.clustering.async.max.commits") | ||
| .defaultValue("4") | ||
| .sinceVersion("0.9.0") | ||
| .withDocumentation("Config to control frequency of async clustering"); | ||
|
|
||
| // Ethan: better docs or configs for daybased strategy (to read code and figure this out) | ||
| public static final ConfigProperty<String> PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST = ConfigProperty | ||
| .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.skipfromlatest.partitions") | ||
| .defaultValue("0") | ||
|
|
@@ -162,6 +168,9 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| + "DAY_ROLLING: clustering partitions on a rolling basis by the hour to avoid clustering all partitions each time, " | ||
| + "which strategy sorts the partitions asc and chooses the partition of which index is divided by 24 and the remainder is equal to the current hour."); | ||
|
|
||
| // Ethan: all these fine-tuning knobs can be simplified if we have a config to indicate the purpose: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should simplify this, especially the default. IIRC we only cluster within the group? So picking group size is important?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 30 is kind of arbitrary. We should rethink this fundamentally.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These groups were added to run clustering in smaller rounds, for large-scale use cases. We should make it simple for out-of-box medium-scale data.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| // increase file size only, or sorting is also required. And we derive a better defaults for these | ||
| // without having to ask users to configure them | ||
| public static final ConfigProperty<String> PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP = ConfigProperty | ||
| .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.bytes.per.group") | ||
| .defaultValue(String.valueOf(2 * 1024 * 1024 * 1024L)) | ||
|
|
@@ -201,6 +210,8 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("Determines how to handle updates, deletes to file groups that are under clustering." | ||
| + " Default strategy just rejects the update"); | ||
|
|
||
| // Ethan: these configs of inline, async, schedule, execute can be merged into one mode config: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| // e.g., none, schedule_only, schedule_and_execute, etc. | ||
| public static final ConfigProperty<String> SCHEDULE_INLINE_CLUSTERING = ConfigProperty | ||
| .key("hoodie.clustering.schedule.inline") | ||
| .defaultValue("false") | ||
|
|
@@ -218,6 +229,7 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.") | ||
| .withAlternatives("hoodie.datasource.clustering.async.enable"); | ||
|
|
||
| // Ethan: this should be removed. User should not tweak this to corrupt the hoodie_commit_time meta field. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The user should not have control. We should preserve commit_time for internal write operations (compaction, clustering, ...). Think about any historical debt before making the final call.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5782 |
||
| public static final ConfigProperty<Boolean> PRESERVE_COMMIT_METADATA = ConfigProperty | ||
| .key("hoodie.clustering.preserve.commit.metadata") | ||
| .defaultValue(true) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
The lock provider should use something like `hudi_locks` as the default table name? Then the user can override if needed.

There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Tracked in HUDI-5782