Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,13 @@
+ " between writers to a Hudi table. Concurrency between Hudi's own table services "
+ " are auto managed internally.")
public class DynamoDbBasedLockConfig extends HoodieConfig {
// Config for DynamoDB based lock provider

// configs for DynamoDb based locks
public static final String DYNAMODB_BASED_LOCK_PROPERTY_PREFIX = LOCK_PREFIX + "dynamodb.";

// Ethan: Can we provide a default value here that is less likely to collide,
// So this config also becomes optional?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the lock provider should do use sth like hudi_locks as the default table name? then the user can override if needed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5782

public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_NAME = ConfigProperty
.key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table")
.noDefaultValue()
Expand Down Expand Up @@ -78,6 +81,7 @@ public class DynamoDbBasedLockConfig extends HoodieConfig {
.withDocumentation("For DynamoDB based lock provider, the region used in endpoint for Amazon DynamoDB service."
+ " Would try to first get it from AWS_REGION environment variable. If not find, by default use us-east-1");

// Ethan: Need to confirm these are good defaults for cost
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sg

public static final ConfigProperty<String> DYNAMODB_LOCK_BILLING_MODE = ConfigProperty
.key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "billing_mode")
.defaultValue(BillingMode.PAY_PER_REQUEST.name())
Expand All @@ -96,16 +100,18 @@ public class DynamoDbBasedLockConfig extends HoodieConfig {
.sinceVersion("0.10.0")
.withDocumentation("For DynamoDB based lock provider, write capacity units when using PROVISIONED billing mode");

// Ethan: 10 min. Is this timeout too long?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shorten to 2?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5782

public static final ConfigProperty<String> DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT = ConfigProperty
.key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "table_creation_timeout")
.defaultValue(String.valueOf(10 * 60 * 1000))
.sinceVersion("0.10.0")
.withDocumentation("For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating DynamoDB table");

// Ethan: Move the infer logic here
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for sane default

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5892

public static final ConfigProperty<String> DYNAMODB_ENDPOINT_URL = ConfigProperty
.key(DYNAMODB_BASED_LOCK_PROPERTY_PREFIX + "endpoint_url")
.noDefaultValue()
.sinceVersion("0.10.1")
.withDocumentation("For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service."
+ " Useful for development with a local dynamodb instance.");
+ " Useful for development with a local dynamodb instance.");
}
26 changes: 15 additions & 11 deletions hudi-aws/src/main/java/org/apache/hudi/config/HoodieAWSConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;

import javax.annotation.concurrent.Immutable;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
import javax.annotation.concurrent.Immutable;

import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_BILLING_MODE;
import static org.apache.hudi.config.DynamoDbBasedLockConfig.DYNAMODB_LOCK_PARTITION_KEY;
Expand All @@ -41,20 +42,23 @@
*/
@Immutable
@ConfigClassProperty(name = "Amazon Web Services Configs",
groupName = ConfigGroups.Names.AWS,
description = "Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks),"
+ " Amazon CloudWatch (metrics).")
groupName = ConfigGroups.Names.AWS,
description = "Amazon Web Services configurations to access resources like Amazon DynamoDB (for locks),"
+ " Amazon CloudWatch (metrics).")
public class HoodieAWSConfig extends HoodieConfig {
// AWS Access Configs

// Ethan: These configs should only be necessary if no environmental variables are not set (to confirm with code)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5892

public static final ConfigProperty<String> AWS_ACCESS_KEY = ConfigProperty
.key("hoodie.aws.access.key")
.noDefaultValue()
.sinceVersion("0.10.0")
.withDocumentation("AWS access key id");
.key("hoodie.aws.access.key")
.noDefaultValue()
.sinceVersion("0.10.0")
.withDocumentation("AWS access key id");

public static final ConfigProperty<String> AWS_SECRET_KEY = ConfigProperty
.key("hoodie.aws.secret.key")
.noDefaultValue()
.sinceVersion("0.10.0")
.key("hoodie.aws.secret.key")
.noDefaultValue()
.sinceVersion("0.10.0")
.withDocumentation("AWS secret key");

public static final ConfigProperty<String> AWS_SESSION_TOKEN = ConfigProperty
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,17 @@
groupName = ConfigGroups.Names.WRITE_CLIENT,
description = "Configurations that control archival.")
public class HoodieArchivalConfig extends HoodieConfig {
//cfg

// Ethan: Should we remove this? Archive should always be enabled?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this become an unsafe/advanced config instead?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5893

public static final ConfigProperty<String> AUTO_ARCHIVE = ConfigProperty
.key("hoodie.archive.automatic")
.defaultValue("true")
.withDocumentation("When enabled, the archival table service is invoked immediately after each commit,"
+ " to archive commits if we cross a maximum value of commits."
+ " It's recommended to enable this, to ensure number of active commits is bounded.");

// Ethan: Is Async archive tested?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should ensure this is safe.. for now advanced config

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mark for table service reliability efforts.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Essential/advanced config tracked in HUDI-5893.

public static final ConfigProperty<String> ASYNC_ARCHIVE = ConfigProperty
.key("hoodie.archive.async")
.defaultValue("false")
Expand All @@ -60,11 +63,13 @@ public class HoodieArchivalConfig extends HoodieConfig {
+ " keep the metadata overhead constant, even as the table size grows."
+ " This config controls the maximum number of instants to retain in the active timeline. ");

// Ethan: Can we use Spark parallelism or auto-derive the configs and remove this?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

idk if there is a way to do it more intelligently, workload based. i.e based on number of isntants deleted or sth?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tbh idk if this is a real problem to go after.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5894

public static final ConfigProperty<Integer> DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE = ConfigProperty
.key("hoodie.archive.delete.parallelism")
.defaultValue(100)
.withDocumentation("Parallelism for deleting archived hoodie commits.");

// Ethan: pick one of "hoodie.keep.min.commits" and "hoodie.keep.max.commits" only?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

min/max was added to batch up the archived files. we leave this alone IMO>

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. We can leave this as is.

public static final ConfigProperty<String> MIN_COMMITS_TO_KEEP = ConfigProperty
.key("hoodie.keep.min.commits")
.defaultValue("20")
Expand Down Expand Up @@ -93,6 +98,7 @@ public class HoodieArchivalConfig extends HoodieConfig {
.withDocumentation("When enable, hoodie will auto merge several small archive files into larger one. It's"
+ " useful when storage scheme doesn't support append operation.");

// Ethan: Once this is well tested, we should remove the feature flag
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nsivabalan could we remove this config now?

public static final ConfigProperty<Boolean> ARCHIVE_BEYOND_SAVEPOINT = ConfigProperty
.key("hoodie.archive.beyond.savepoint")
.defaultValue(false)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
+ "The bootstrap operation can flexibly avoid copying data over before you can use Hudi and support running the existing "
+ " writers and new hudi writers in parallel, to validate the migration.")
public class HoodieBootstrapConfig extends HoodieConfig {
// Configs for Bootstrap feature
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a lot of them may overlap with write configs. good low hanging fruits here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5895


public static final ConfigProperty<String> BASE_PATH = ConfigProperty
.key("hoodie.bootstrap.base.path")
Expand Down Expand Up @@ -74,24 +75,28 @@ public class HoodieBootstrapConfig extends HoodieConfig {
.sinceVersion("0.6.0")
.withDocumentation("Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD");

// Ethan: can this be aligned with key gen for write (no need for a separate bootstrap key gen config)?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 sane defaults

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5895

public static final ConfigProperty<String> KEYGEN_CLASS_NAME = ConfigProperty
.key("hoodie.bootstrap.keygen.class")
.noDefaultValue()
.sinceVersion("0.6.0")
.withDocumentation("Key generator implementation to be used for generating keys from the bootstrapped dataset");

// Ethan: similar here
public static final ConfigProperty<String> KEYGEN_TYPE = ConfigProperty
.key("hoodie.bootstrap.keygen.type")
.defaultValue(KeyGeneratorType.SIMPLE.name())
.sinceVersion("0.9.0")
.withDocumentation("Type of build-in key generator, currently support SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE");

// Ethan: better default translator: if seeing hive-style partitioning, automatically parse the partition value?
public static final ConfigProperty<String> PARTITION_PATH_TRANSLATOR_CLASS_NAME = ConfigProperty
.key("hoodie.bootstrap.partitionpath.translator.class")
.defaultValue(IdentityBootstrapPartitionPathTranslator.class.getName())
.sinceVersion("0.6.0")
.withDocumentation("Translates the partition paths from the bootstrapped data into how is laid out as a Hudi table.");

// Ethan: can this be aligned with Spark parallelism?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5894

public static final ConfigProperty<String> PARALLELISM_VALUE = ConfigProperty
.key("hoodie.bootstrap.parallelism")
.defaultValue("1500")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.apache.hudi.table.action.clean.CleaningTriggerStrategy;

import javax.annotation.concurrent.Immutable;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
Expand All @@ -41,10 +42,12 @@
*/
@Immutable
@ConfigClassProperty(name = "Clean Configs",
groupName = ConfigGroups.Names.WRITE_CLIENT,
description = "Cleaning (reclamation of older/unused file groups/slices).")
groupName = ConfigGroups.Names.WRITE_CLIENT,
description = "Cleaning (reclamation of older/unused file groups/slices).")
public class HoodieCleanConfig extends HoodieConfig {
// Configs for Clean action

// Ethan: should we standardize on `.enable` instead of `.automatic`?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 as long as its backwards compatible.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5896

public static final ConfigProperty<String> AUTO_CLEAN = ConfigProperty
.key("hoodie.clean.automatic")
.defaultValue("true")
Expand All @@ -58,12 +61,16 @@ public class HoodieCleanConfig extends HoodieConfig {
.withDocumentation("Only applies when " + AUTO_CLEAN.key() + " is turned on. "
+ "When turned on runs cleaner async with writing, which can speed up overall write performance.");

// Ethan: when either "hoodie.cleaner.commits.retained", "hoodie.cleaner.hours.retained",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so, we infer the cleaning policy and throw errors if multiple of them are set?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Tracked in HUDI-5892.

// or "hoodie.cleaner.fileversions.retained" is set, should we automatically use the
// corresponding clean policy?
public static final ConfigProperty<String> CLEANER_COMMITS_RETAINED = ConfigProperty
.key("hoodie.cleaner.commits.retained")
.defaultValue("10")
.withDocumentation("Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits "
+ "(scheduled). This also directly translates into how much data retention the table supports for incremental queries.");

// Ethan: same here
public static final ConfigProperty<String> CLEANER_HOURS_RETAINED = ConfigProperty.key("hoodie.cleaner.hours.retained")
.defaultValue("24")
.withDocumentation("Number of hours for which commits need to be retained. This config provides a more flexible option as"
Expand All @@ -84,11 +91,13 @@ public class HoodieCleanConfig extends HoodieConfig {
.withDocumentation("Controls how cleaning is scheduled. Valid options: "
+ Arrays.stream(CleaningTriggerStrategy.values()).map(Enum::name).collect(Collectors.joining(",")));

// Ethan: "hoodie.clean.trigger.max.commits" is what this means
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 again on aligining with how we talk about scheduling and executing table services.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5896

public static final ConfigProperty<String> CLEAN_MAX_COMMITS = ConfigProperty
.key("hoodie.clean.max.commits")
.defaultValue("1")
.withDocumentation("Number of commits after the last clean operation, before scheduling of a new clean is attempted.");

// Ethan: same here
public static final ConfigProperty<String> CLEANER_FILE_VERSIONS_RETAINED = ConfigProperty
.key("hoodie.cleaner.fileversions.retained")
.defaultValue("3")
Expand All @@ -102,6 +111,7 @@ public class HoodieCleanConfig extends HoodieConfig {
+ " in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full"
+ " table for each planning (even with a metadata table).");

// Ethan: this should be an internal config, based on concurrency control mode. Should not be tweaked by user.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make this advanced and move on.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5893

public static final ConfigProperty<String> FAILED_WRITES_CLEANER_POLICY = ConfigProperty
.key("hoodie.cleaner.policy.failed.writes")
.defaultValue(HoodieFailedWritesCleaningPolicy.EAGER.name())
Expand All @@ -117,18 +127,21 @@ public class HoodieCleanConfig extends HoodieConfig {
+ "failed writes to re-claim space. Choose to perform this rollback of failed writes eagerly before "
+ "every writer starts (only supported for single writer) or lazily by the cleaner (required for multi-writers)");

// Ethan: same here. Can we reuse Spark parallelism or derive automatically?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5894

public static final ConfigProperty<String> CLEANER_PARALLELISM_VALUE = ConfigProperty
.key("hoodie.cleaner.parallelism")
.defaultValue("200")
.withDocumentation("Parallelism for the cleaning operation. Increase this if cleaning becomes slow.");

// Ethan: wondering if this is still needed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nsivabalan do you think we still need this?

public static final ConfigProperty<Boolean> ALLOW_MULTIPLE_CLEANS = ConfigProperty
.key("hoodie.clean.allow.multiple")
.defaultValue(true)
.sinceVersion("0.11.0")
.withDocumentation("Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests should be mutually exclusive, "
+ ".i.e. a 2nd clean will not be scheduled if another clean is not yet completed to avoid repeat cleaning of same files, they might want to disable this config.");

// Ethan: should the default behavior be deleting original parquet files and then we can remove this config?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should ask Udit, why this is the default

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5895

public static final ConfigProperty<String> CLEANER_BOOTSTRAP_BASE_FILE_ENABLE = ConfigProperty
.key("hoodie.cleaner.delete.bootstrap.base.file")
.defaultValue("false")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
description = "Configurations that control the clustering table service in hudi, "
+ "which optimizes the storage layout for better query performance by sorting and sizing data files.")
public class HoodieClusteringConfig extends HoodieConfig {
// Configs for Clustering operations
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

revisit all docs here, and make it very user friendly

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5897


// Any strategy specific params can be saved with this prefix
public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy.";
Expand Down Expand Up @@ -90,6 +91,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
.withDocumentation("End partition used to filter partition (inclusive), only effective when the filter mode '"
+ PLAN_PARTITION_FILTER_MODE + "' is " + ClusteringPlanPartitionFilterMode.SELECTED_PARTITIONS.name());

// Ethan: Should this be aligned with default parquet base file size?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we could infer this from base file size.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would read the code though to understand if it has side effects i.e the Hudi will always leave 100MB files, so will clustering ever kick in? we need to trigger clustering based on col stats and how clustered the table is.

public static final ConfigProperty<String> PLAN_STRATEGY_SMALL_FILE_LIMIT = ConfigProperty
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit")
.defaultValue(String.valueOf(300 * 1024 * 1024L))
Expand All @@ -108,6 +110,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
.sinceVersion("0.11.0")
.withDocumentation("Partitions to run clustering");

// Ethan: enum instead of class name to determine strategy type? So this can be engine-agnostic
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5892

public static final ConfigProperty<String> PLAN_STRATEGY_CLASS_NAME = ConfigProperty
.key("hoodie.clustering.plan.strategy.class")
.defaultValue(SPARK_SIZED_BASED_CLUSTERING_PLAN_STRATEGY)
Expand All @@ -131,18 +134,21 @@ public class HoodieClusteringConfig extends HoodieConfig {
.withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete")
.withAlternatives("hoodie.datasource.clustering.inline.enable");

// Ethan: this can apply to both inline or aysnc clustering. Rename it for better readability.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5896

public static final ConfigProperty<String> INLINE_CLUSTERING_MAX_COMMITS = ConfigProperty
.key("hoodie.clustering.inline.max.commits")
.defaultValue("4")
.sinceVersion("0.7.0")
.withDocumentation("Config to control frequency of clustering planning");

// Ethan: should be merged with inline one
public static final ConfigProperty<String> ASYNC_CLUSTERING_MAX_COMMITS = ConfigProperty
.key("hoodie.clustering.async.max.commits")
.defaultValue("4")
.sinceVersion("0.9.0")
.withDocumentation("Config to control frequency of async clustering");

// Ethan: better docs or configs for daybased strategy (to read code and figure this out)
public static final ConfigProperty<String> PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST = ConfigProperty
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.skipfromlatest.partitions")
.defaultValue("0")
Expand All @@ -162,6 +168,9 @@ public class HoodieClusteringConfig extends HoodieConfig {
+ "DAY_ROLLING: clustering partitions on a rolling basis by the hour to avoid clustering all partitions each time, "
+ "which strategy sorts the partitions asc and chooses the partition of which index is divided by 24 and the remainder is equal to the current hour.");

// Ethan: all these fine-tuning knobs can be simplified if we have a config to indicate the purpose:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should simplify this. esp the default. IIRC we only cluster within the group? so picking group size is important?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

30 is kind of arbitrary.. We should rethink this fundamentally.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These groups were added to run clustering in smaller rounds, for large scale use-cases. We should make it simple for out of box medium scale data

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5892

// increase file size only, or sorting is also required. And we derive a better defaults for these
// without having to ask users to configure them
public static final ConfigProperty<String> PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP = ConfigProperty
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.bytes.per.group")
.defaultValue(String.valueOf(2 * 1024 * 1024 * 1024L))
Expand Down Expand Up @@ -201,6 +210,8 @@ public class HoodieClusteringConfig extends HoodieConfig {
.withDocumentation("Determines how to handle updates, deletes to file groups that are under clustering."
+ " Default strategy just rejects the update");

// Ethan: these configs of inline, async, schedule, execute can be merged into one mode config:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5896

// e.g., none, schedule_only, schedule_and_execute, etc.
public static final ConfigProperty<String> SCHEDULE_INLINE_CLUSTERING = ConfigProperty
.key("hoodie.clustering.schedule.inline")
.defaultValue("false")
Expand All @@ -218,6 +229,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
.withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.")
.withAlternatives("hoodie.datasource.clustering.async.enable");

// Ethan: this should be removed. User should not tweak this to corrupt the hoodie_commit_time meta field.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

user should not have control. we should preserve commit_time for internal write operations (compaction, clustering.. ) . Think about any historical debt, before making final call.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracked in HUDI-5782

public static final ConfigProperty<Boolean> PRESERVE_COMMIT_METADATA = ConfigProperty
.key("hoodie.clustering.preserve.commit.metadata")
.defaultValue(true)
Expand Down
Loading