-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-5739] Review Hudi Config Defaults #7912
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from all commits
337ab69
85d0754
2b6ac18
a5e96b5
5a345a0
d0be686
debd0a9
4a1bae3
0dbcb68
e047329
ca8557f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,10 +40,13 @@ | |
| + " between writers to a Hudi table. Concurrency between Hudi's own table services " | ||
| + " are auto managed internally.") | ||
| public class DynamoDbBasedLockConfig extends HoodieConfig { | ||
| // Config for DynamoDB based lock provider | ||
|
|
||
| // configs for DynamoDb based locks | ||
| public static final String DYNAMODB_BASED_LOCK_PROPERTY_PREFIX = LOCK_PREFIX + "dynamodb."; | ||
|
|
||
| // Ethan: Can we provide a default value here that is less likely to collide, | ||
| // So this config also becomes optional? | ||
| public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_NAME = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table") | ||
| .noDefaultValue() | ||
|
|
@@ -78,6 +81,7 @@ public class DynamoDbBasedLockConfig extends HoodieConfig { | |
| .withDocumentation("For DynamoDB based lock provider, the region used in endpoint for Amazon DynamoDB service." | ||
| + " Would try to first get it from AWS_REGION environment variable. If not find, by default use us-east-1"); | ||
|
|
||
| // Ethan: Need to confirm these are good defaults for cost | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is fine.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sg |
||
| public static final ConfigProperty<String> DYNAMODB_LOCK_BILLING_MODE = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "billing_mode") | ||
| .defaultValue(BillingMode.PAY_PER_REQUEST.name()) | ||
|
|
@@ -96,16 +100,18 @@ public class DynamoDbBasedLockConfig extends HoodieConfig { | |
| .sinceVersion("0.10.0") | ||
| .withDocumentation("For DynamoDB based lock provider, write capacity units when using PROVISIONED billing mode"); | ||
|
|
||
| // Ethan: 10 min. Is this timeout too long? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Shorten to 2?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5782 |
||
| public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table_creation_timeout") | ||
| .defaultValue(String.valueOf(10 * 60 * 1000)) | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating DynamoDB table"); | ||
|
|
||
| // Ethan: Move the infer logic here | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 for sane default
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| public static final ConfigProperty<String> DYNAMODB_ENDPOINT_URL = ConfigProperty | ||
| .key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "endpoint_url") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.1") | ||
| .withDocumentation("For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service." | ||
| + " Useful for development with a local dynamodb instance."); | ||
| + " Useful for development with a local dynamodb instance."); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,11 +23,12 @@ | |
| import org.apache.hudi.common.config.ConfigProperty; | ||
| import org.apache.hudi.common.config.HoodieConfig; | ||
|
|
||
| import javax.annotation.concurrent.Immutable; | ||
|
|
||
| import java.io.File; | ||
| import java.io.FileReader; | ||
| import java.io.IOException; | ||
| import java.util.Properties; | ||
| import javax.annotation.concurrent.Immutable; | ||
|
|
||
| import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_BILLING_MODE; | ||
| import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY; | ||
|
|
@@ -41,20 +42,23 @@ | |
| */ | ||
| @Immutable | ||
| @ConfigClassProperty(name = "Amazon Web Services Configs", | ||
| groupName = ConfigGroups.Names.AWS, | ||
| description = "Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks)," | ||
| + " Amazon CloudWatch (metrics).") | ||
| groupName = ConfigGroups.Names.AWS, | ||
| description = "Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks)," | ||
| + " Amazon CloudWatch (metrics).") | ||
| public class HoodieAWSConfig extends HoodieConfig { | ||
| // AWS Access Configs | ||
|
|
||
| // Ethan: These configs should only be necessary if no environmental variables are not set (to confirm with code) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| public static final ConfigProperty<String> AWS_ACCESS_KEY = ConfigProperty | ||
| .key("hoodie.aws.access.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("AWS access key id"); | ||
| .key("hoodie.aws.access.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("AWS access key id"); | ||
|
|
||
| public static final ConfigProperty<String> AWS_SECRET_KEY = ConfigProperty | ||
| .key("hoodie.aws.secret.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .key("hoodie.aws.secret.key") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.10.0") | ||
| .withDocumentation("AWS secret key"); | ||
|
|
||
| public static final ConfigProperty<String> AWS_SESSION_TOKEN = ConfigProperty | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,14 +38,17 @@ | |
| groupName = ConfigGroups.Names.WRITE_CLIENT, | ||
| description = "Configurations that control archival.") | ||
| public class HoodieArchivalConfig extends HoodieConfig { | ||
| //cfg | ||
|
|
||
| // Ethan: Should we remove this? Archive should always be enabled? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this become an unsafe/advanced config instead?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5893 |
||
| public static final ConfigProperty<String> AUTO_ARCHIVE = ConfigProperty | ||
| .key("hoodie.archive.automatic") | ||
| .defaultValue("true") | ||
| .withDocumentation("When enabled, the archival table service is invoked immediately after each commit," | ||
| + " to archive commits if we cross a maximum value of commits." | ||
| + " It's recommended to enable this, to ensure number of active commits is bounded."); | ||
|
|
||
| // Ethan: Is Async archive tested? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should ensure this is safe; for now, advanced config.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Mark for table service reliability efforts.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Essential/advanced config tracked in HUDI-5893. |
||
| public static final ConfigProperty<String> ASYNC_ARCHIVE = ConfigProperty | ||
| .key("hoodie.archive.async") | ||
| .defaultValue("false") | ||
|
|
@@ -60,11 +63,13 @@ public class HoodieArchivalConfig extends HoodieConfig { | |
| + " keep the metadata overhead constant, even as the table size grows." | ||
| + " This config controls the maximum number of instants to retain in the active timeline. "); | ||
|
|
||
| // Ethan: Can we use Spark parallelism or auto-derive the configs and remove this? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know if there is a way to do it more intelligently, workload-based, i.e., based on number of instants deleted or something?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To be honest, I don't know if this is a real problem to go after.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5894 |
||
| public static final ConfigProperty<Integer> DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE = ConfigProperty | ||
| .key("hoodie.archive.delete.parallelism") | ||
| .defaultValue(100) | ||
| .withDocumentation("Parallelism for deleting archived hoodie commits."); | ||
|
|
||
| // Ethan: pick one of "hoodie.keep.min.commits" and "hoodie.keep.max.commits" only? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. min/max was added to batch up the archived files. We should leave this alone, IMO.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree. We can leave this as is. |
||
| public static final ConfigProperty<String> MIN_COMMITS_TO_KEEP = ConfigProperty | ||
| .key("hoodie.keep.min.commits") | ||
| .defaultValue("20") | ||
|
|
@@ -93,6 +98,7 @@ public class HoodieArchivalConfig extends HoodieConfig { | |
| .withDocumentation("When enable, hoodie will auto merge several small archive files into larger one. It's" | ||
| + " useful when storage scheme doesn't support append operation."); | ||
|
|
||
| // Ethan: Once this is well tested, we should remove the feature flag | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @nsivabalan could we remove this config now? |
||
| public static final ConfigProperty<Boolean> ARCHIVE_BEYOND_SAVEPOINT = ConfigProperty | ||
| .key("hoodie.archive.beyond.savepoint") | ||
| .defaultValue(false) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,6 +46,7 @@ | |
| + "The bootstrap operation can flexibly avoid copying data over before you can use Hudi and support running the existing " | ||
| + " writers and new hudi writers in parallel, to validate the migration.") | ||
| public class HoodieBootstrapConfig extends HoodieConfig { | ||
| // Configs for Bootstrap feature | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A lot of them may overlap with write configs. Good low-hanging fruits here.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5895 |
||
|
|
||
| public static final ConfigProperty<String> BASE_PATH = ConfigProperty | ||
| .key("hoodie.bootstrap.base.path") | ||
|
|
@@ -74,24 +75,28 @@ public class HoodieBootstrapConfig extends HoodieConfig { | |
| .sinceVersion("0.6.0") | ||
| .withDocumentation("Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD"); | ||
|
|
||
| // Ethan: can this be aligned with key gen for write (no need for a separate bootstrap key gen config)? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 sane defaults
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5895 |
||
| public static final ConfigProperty<String> KEYGEN_CLASS_NAME = ConfigProperty | ||
| .key("hoodie.bootstrap.keygen.class") | ||
| .noDefaultValue() | ||
| .sinceVersion("0.6.0") | ||
| .withDocumentation("Key generator implementation to be used for generating keys from the bootstrapped dataset"); | ||
|
|
||
| // Ethan: similar here | ||
| public static final ConfigProperty<String> KEYGEN_TYPE = ConfigProperty | ||
| .key("hoodie.bootstrap.keygen.type") | ||
| .defaultValue(KeyGeneratorType.SIMPLE.name()) | ||
| .sinceVersion("0.9.0") | ||
| .withDocumentation("Type of build-in key generator, currently support SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE"); | ||
|
|
||
| // Ethan: better default translator: if seeing hive-style partitioning, automatically parse the partition value? | ||
| public static final ConfigProperty<String> PARTITION_PATH_TRANSLATOR_CLASS_NAME = ConfigProperty | ||
| .key("hoodie.bootstrap.partitionpath.translator.class") | ||
| .defaultValue(IdentityBootstrapPartitionPathTranslator.class.getName()) | ||
| .sinceVersion("0.6.0") | ||
| .withDocumentation("Translates the partition paths from the bootstrapped data into how is laid out as a Hudi table."); | ||
|
|
||
| // Ethan: can this be aligned with Spark parallelism? | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5894 |
||
| public static final ConfigProperty<String> PARALLELISM_VALUE = ConfigProperty | ||
| .key("hoodie.bootstrap.parallelism") | ||
| .defaultValue("1500") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ | |
| import org.apache.hudi.table.action.clean.CleaningTriggerStrategy; | ||
|
|
||
| import javax.annotation.concurrent.Immutable; | ||
|
|
||
| import java.io.File; | ||
| import java.io.FileReader; | ||
| import java.io.IOException; | ||
|
|
@@ -41,10 +42,12 @@ | |
| */ | ||
| @Immutable | ||
| @ConfigClassProperty(name = "Clean Configs", | ||
| groupName = ConfigGroups.Names.WRITE_CLIENT, | ||
| description = "Cleaning (reclamation of older/unused file groups/slices).") | ||
| groupName = ConfigGroups.Names.WRITE_CLIENT, | ||
| description = "Cleaning (reclamation of older/unused file groups/slices).") | ||
| public class HoodieCleanConfig extends HoodieConfig { | ||
| // Configs for Clean action | ||
|
|
||
| // Ethan: should we standardize on `.enable` instead of `.automatic`? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 as long as it's backwards compatible.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| public static final ConfigProperty<String> AUTO_CLEAN = ConfigProperty | ||
| .key("hoodie.clean.automatic") | ||
| .defaultValue("true") | ||
|
|
@@ -58,12 +61,16 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| .withDocumentation("Only applies when " + AUTO_CLEAN.key() + " is turned on. " | ||
| + "When turned on runs cleaner async with writing, which can speed up overall write performance."); | ||
|
|
||
| // Ethan: when either "hoodie.cleaner.commits.retained", "hoodie.cleaner.hours.retained", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So, we infer the cleaning policy and throw errors if multiple of them are set?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes. Tracked in HUDI-5892. |
||
| // or "hoodie.cleaner.fileversions.retained" is set, should we automatically use the | ||
| // corresponding clean policy? | ||
| public static final ConfigProperty<String> CLEANER_COMMITS_RETAINED = ConfigProperty | ||
| .key("hoodie.cleaner.commits.retained") | ||
| .defaultValue("10") | ||
| .withDocumentation("Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits " | ||
| + "(scheduled). This also directly translates into how much data retention the table supports for incremental queries."); | ||
|
|
||
| // Ethan: same here | ||
| public static final ConfigProperty<String> CLEANER_HOURS_RETAINED = ConfigProperty.key("hoodie.cleaner.hours.retained") | ||
| .defaultValue("24") | ||
| .withDocumentation("Number of hours for which commits need to be retained. This config provides a more flexible option as" | ||
|
|
@@ -84,11 +91,13 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| .withDocumentation("Controls how cleaning is scheduled. Valid options: " | ||
| + Arrays.stream(CleaningTriggerStrategy.values()).map(Enum::name).collect(Collectors.joining(","))); | ||
|
|
||
| // Ethan: "hoodie.clean.trigger.max.commits" is what this means | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 again on aligning with how we talk about scheduling and executing table services.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| public static final ConfigProperty<String> CLEAN_MAX_COMMITS = ConfigProperty | ||
| .key("hoodie.clean.max.commits") | ||
| .defaultValue("1") | ||
| .withDocumentation("Number of commits after the last clean operation, before scheduling of a new clean is attempted."); | ||
|
|
||
| // Ethan: same here | ||
| public static final ConfigProperty<String> CLEANER_FILE_VERSIONS_RETAINED = ConfigProperty | ||
| .key("hoodie.cleaner.fileversions.retained") | ||
| .defaultValue("3") | ||
|
|
@@ -102,6 +111,7 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| + " in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full" | ||
| + " table for each planning (even with a metadata table)."); | ||
|
|
||
| // Ethan: this should be an internal config, based on concurrency control mode. Should not be tweaked by user. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make this advanced and move on.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5893 |
||
| public static final ConfigProperty<String> FAILED_WRITES_CLEANER_POLICY = ConfigProperty | ||
| .key("hoodie.cleaner.policy.failed.writes") | ||
| .defaultValue(HoodieFailedWritesCleaningPolicy.EAGER.name()) | ||
|
|
@@ -117,18 +127,21 @@ public class HoodieCleanConfig extends HoodieConfig { | |
| + "failed writes to re-claim space. Choose to perform this rollback of failed writes eagerly before " | ||
| + "every writer starts (only supported for single writer) or lazily by the cleaner (required for multi-writers)"); | ||
|
|
||
| // Ethan: same here. Can we reuse Spark parallelism or derive automatically? | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5894 |
||
| public static final ConfigProperty<String> CLEANER_PARALLELISM_VALUE = ConfigProperty | ||
| .key("hoodie.cleaner.parallelism") | ||
| .defaultValue("200") | ||
| .withDocumentation("Parallelism for the cleaning operation. Increase this if cleaning becomes slow."); | ||
|
|
||
| // Ethan: wondering if this is still needed. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @nsivabalan do you think we still need this? |
||
| public static final ConfigProperty<Boolean> ALLOW_MULTIPLE_CLEANS = ConfigProperty | ||
| .key("hoodie.clean.allow.multiple") | ||
| .defaultValue(true) | ||
| .sinceVersion("0.11.0") | ||
| .withDocumentation("Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests should be mutually exclusive, " | ||
| + ".i.e. a 2nd clean will not be scheduled if another clean is not yet completed to avoid repeat cleaning of same files, they might want to disable this config."); | ||
|
|
||
| // Ethan: should the default behavior be deleting original parquet files and then we can remove this config? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should ask Udit why this is the default.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5895 |
||
| public static final ConfigProperty<String> CLEANER_BOOTSTRAP_BASE_FILE_ENABLE = ConfigProperty | ||
| .key("hoodie.cleaner.delete.bootstrap.base.file") | ||
| .defaultValue("false") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,6 +47,7 @@ | |
| description = "Configurations that control the clustering table service in hudi, " | ||
| + "which optimizes the storage layout for better query performance by sorting and sizing data files.") | ||
| public class HoodieClusteringConfig extends HoodieConfig { | ||
| // Configs for Clustering operations | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Revisit all docs here, and make them very user-friendly.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5897 |
||
|
|
||
| // Any strategy specific params can be saved with this prefix | ||
| public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy."; | ||
|
|
@@ -90,6 +91,7 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("End partition used to filter partition (inclusive), only effective when the filter mode '" | ||
| + PLAN_PARTITION_FILTER_MODE + "' is " + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name()); | ||
|
|
||
| // Ethan: Should this be aligned with default parquet base file size? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We could infer this from base file size.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would read the code, though, to understand if it has side effects — i.e., Hudi will always leave 100MB files, so will clustering ever kick in? We need to trigger clustering based on col stats and how clustered the table is. |
||
| public static final ConfigProperty<String> PLAN_STRATEGY_SMALL_FILE_LIMIT = ConfigProperty | ||
| .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit") | ||
| .defaultValue(String.valueOf(300 * 1024 * 1024L)) | ||
|
|
@@ -108,6 +110,7 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .sinceVersion("0.11.0") | ||
| .withDocumentation("Partitions to run clustering"); | ||
|
|
||
| // Ethan: enum instead of class name to determine strategy type? So this can be engine-agnostic | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| public static final ConfigProperty<String> PLAN_STRATEGY_CLASS_NAME = ConfigProperty | ||
| .key("hoodie.clustering.plan.strategy.class") | ||
| .defaultValue(SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY) | ||
|
|
@@ -131,18 +134,21 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete") | ||
| .withAlternatives("hoodie.datasource.clustering.inline.enable"); | ||
|
|
||
| // Ethan: this can apply to both inline or aysnc clustering. Rename it for better readability. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| public static final ConfigProperty<String> INLINE_CLUSTERING_MAX_COMMITS = ConfigProperty | ||
| .key("hoodie.clustering.inline.max.commits") | ||
| .defaultValue("4") | ||
| .sinceVersion("0.7.0") | ||
| .withDocumentation("Config to control frequency of clustering planning"); | ||
|
|
||
| // Ethan: should be merged with inline one | ||
| public static final ConfigProperty<String> ASYNC_CLUSTERING_MAX_COMMITS = ConfigProperty | ||
| .key("hoodie.clustering.async.max.commits") | ||
| .defaultValue("4") | ||
| .sinceVersion("0.9.0") | ||
| .withDocumentation("Config to control frequency of async clustering"); | ||
|
|
||
| // Ethan: better docs or configs for daybased strategy (to read code and figure this out) | ||
| public static final ConfigProperty<String> PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST = ConfigProperty | ||
| .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.skipfromlatest.partitions") | ||
| .defaultValue("0") | ||
|
|
@@ -162,6 +168,9 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| + "DAY_ROLLING: clustering partitions on a rolling basis by the hour to avoid clustering all partitions each time, " | ||
| + "which strategy sorts the partitions asc and chooses the partition of which index is divided by 24 and the remainder is equal to the current hour."); | ||
|
|
||
| // Ethan: all these fine-tuning knobs can be simplified if we have a config to indicate the purpose: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should simplify this, especially the default. IIRC we only cluster within the group? So picking group size is important?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 30 is kind of arbitrary. We should rethink this fundamentally.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These groups were added to run clustering in smaller rounds, for large-scale use cases. We should make it simple for out-of-box medium-scale data.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5892 |
||
| // increase file size only, or sorting is also required. And we derive a better defaults for these | ||
| // without having to ask users to configure them | ||
| public static final ConfigProperty<String> PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP = ConfigProperty | ||
| .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.bytes.per.group") | ||
| .defaultValue(String.valueOf(2 * 1024 * 1024 * 1024L)) | ||
|
|
@@ -201,6 +210,8 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("Determines how to handle updates, deletes to file groups that are under clustering." | ||
| + " Default strategy just rejects the update"); | ||
|
|
||
| // Ethan: these configs of inline, async, schedule, execute can be merged into one mode config: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5896 |
||
| // e.g., none, schedule_only, schedule_and_execute, etc. | ||
| public static final ConfigProperty<String> SCHEDULE_INLINE_CLUSTERING = ConfigProperty | ||
| .key("hoodie.clustering.schedule.inline") | ||
| .defaultValue("false") | ||
|
|
@@ -218,6 +229,7 @@ public class HoodieClusteringConfig extends HoodieConfig { | |
| .withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.") | ||
| .withAlternatives("hoodie.datasource.clustering.async.enable"); | ||
|
|
||
| // Ethan: this should be removed. User should not tweak this to corrupt the hoodie_commit_time meta field. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The user should not have control. We should preserve commit_time for internal write operations (compaction, clustering, ...). Think about any historical debt before making the final call.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tracked in HUDI-5782 |
||
| public static final ConfigProperty<Boolean> PRESERVE_COMMIT_METADATA = ConfigProperty | ||
| .key("hoodie.clustering.preserve.commit.metadata") | ||
| .defaultValue(true) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
The lock provider should use something like `hudi_locks` as the default table name? Then the user can override if needed.

There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Tracked in HUDI-5782