-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-5503] Optimize flink table factory option check #7608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,6 +56,8 @@ public class RowDataKeyGen implements Serializable { | |
|
|
||
| private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/"; | ||
|
|
||
| private final boolean hasRecordKey; | ||
|
|
||
| private final String[] recordKeyFields; | ||
| private final String[] partitionPathFields; | ||
|
|
||
|
|
@@ -90,7 +92,11 @@ private RowDataKeyGen( | |
|
|
||
| this.hiveStylePartitioning = hiveStylePartitioning; | ||
| this.encodePartitionPath = encodePartitionPath; | ||
| if (this.recordKeyFields.length == 1) { | ||
|
|
||
| this.hasRecordKey = hasRecordKey(fieldNames); | ||
| if (!hasRecordKey) { | ||
| this.recordKeyProjection = null; | ||
| } else if (this.recordKeyFields.length == 1) { | ||
| // efficient code path | ||
| this.simpleRecordKey = true; | ||
| int recordKeyIdx = fieldNames.indexOf(this.recordKeyFields[0]); | ||
|
|
@@ -115,6 +121,14 @@ private RowDataKeyGen( | |
| this.keyGenOpt = keyGenOpt; | ||
| } | ||
|
|
||
| /** | ||
| * Checks whether user provides any record key. | ||
| */ | ||
| private boolean hasRecordKey(List<String> fieldNames) { | ||
| return recordKeyFields.length != 1 | ||
| || fieldNames.contains(recordKeyFields[0]); | ||
| } | ||
|
|
||
| public static RowDataKeyGen instance(Configuration conf, RowType rowType) { | ||
| Option<TimestampBasedAvroKeyGenerator> keyGeneratorOpt = Option.empty(); | ||
| if (TimestampBasedAvroKeyGenerator.class.getName().equals(conf.getString(FlinkOptions.KEYGEN_CLASS_NAME))) { | ||
|
|
@@ -134,7 +148,11 @@ public HoodieKey getHoodieKey(RowData rowData) { | |
| } | ||
|
|
||
| public String getRecordKey(RowData rowData) { | ||
| if (this.simpleRecordKey) { | ||
| if (!hasRecordKey) { | ||
| // should be optimized to unique values that can be easily calculated with low cost | ||
| // for e.g, fileId + auto inc integer | ||
| return EMPTY_RECORDKEY_PLACEHOLDER; | ||
| } else if (this.simpleRecordKey) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure whether we should use the empty string for the pk-less scenario, because all the records would have the same value primary key, which breaks the pk-less semantics, for pk-less, we actually mean all the records are unique, there is no need to define the primary key. Another solution is to use the UUID as the primary key, WDYT ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure if removing the pk field would cause errors somewhere, and writing an identical value should use very little storage in a columnar file format like Parquet, while a UUID would use much more space since it is unique and therefore cannot compress well. I also don't know where we could use the UUID, so I think storing an identical value for the pk is better. I changed the default key value to RowDataKeyGen.EMPTY_RECORDKEY_PLACEHOLDER since an empty row key reports an error.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In #7622, the empty string is also used for keyless primary keys, so it's okay here if we reach an agreement and never use the primary key. |
||
| return getRecordKey(recordKeyFieldGetter.getFieldOrNull(rowData), this.recordKeyFields[0]); | ||
| } else { | ||
| Object[] keyValues = this.recordKeyProjection.projectAsValues(rowData); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,8 +19,10 @@ | |
| package org.apache.hudi.table; | ||
|
|
||
| import org.apache.hudi.common.model.DefaultHoodieRecordPayload; | ||
| import org.apache.hudi.common.table.HoodieTableConfig; | ||
| import org.apache.hudi.common.util.StringUtils; | ||
| import org.apache.hudi.configuration.FlinkOptions; | ||
| import org.apache.hudi.configuration.HadoopConfigurations; | ||
| import org.apache.hudi.configuration.OptionsResolver; | ||
| import org.apache.hudi.exception.HoodieValidationException; | ||
| import org.apache.hudi.index.HoodieIndex; | ||
|
|
@@ -30,6 +32,7 @@ | |
| import org.apache.hudi.keygen.constant.KeyGeneratorOptions; | ||
| import org.apache.hudi.util.AvroSchemaConverter; | ||
| import org.apache.hudi.util.DataTypeUtils; | ||
| import org.apache.hudi.util.StreamerUtil; | ||
|
|
||
| import org.apache.flink.configuration.ConfigOption; | ||
| import org.apache.flink.configuration.Configuration; | ||
|
|
@@ -68,12 +71,11 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab | |
| @Override | ||
| public DynamicTableSource createDynamicTableSource(Context context) { | ||
| Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions()); | ||
| ResolvedSchema schema = context.getCatalogTable().getResolvedSchema(); | ||
| sanityCheck(conf, schema); | ||
| setupConfOptions(conf, context.getObjectIdentifier(), context.getCatalogTable(), schema); | ||
|
|
||
| Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() -> | ||
| new ValidationException("Option [path] should not be empty."))); | ||
| setupTableOptions(conf.getString(FlinkOptions.PATH), conf); | ||
| ResolvedSchema schema = context.getCatalogTable().getResolvedSchema(); | ||
| setupConfOptions(conf, context.getObjectIdentifier(), context.getCatalogTable(), schema); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The sanity check for the source can be removed. When the primary key definition is missing, the streaming source for a MOR table would treat the case as pk-less, so no deletes are emitted. |
||
| return new HoodieTableSource( | ||
| schema, | ||
| path, | ||
|
|
@@ -87,12 +89,34 @@ public DynamicTableSink createDynamicTableSink(Context context) { | |
| Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions()); | ||
| checkArgument(!StringUtils.isNullOrEmpty(conf.getString(FlinkOptions.PATH)), | ||
| "Option [path] should not be empty."); | ||
| setupTableOptions(conf.getString(FlinkOptions.PATH), conf); | ||
| ResolvedSchema schema = context.getCatalogTable().getResolvedSchema(); | ||
| sanityCheck(conf, schema); | ||
| setupConfOptions(conf, context.getObjectIdentifier(), context.getCatalogTable(), schema); | ||
| return new HoodieTableSink(conf, schema); | ||
| } | ||
|
|
||
| /** | ||
| * Supplement the table config options if not specified. | ||
| */ | ||
| private void setupTableOptions(String basePath, Configuration conf) { | ||
| StreamerUtil.getTableConfig(basePath, HadoopConfigurations.getHadoopConf(conf)) | ||
| .ifPresent(tableConfig -> { | ||
| if (tableConfig.contains(HoodieTableConfig.RECORDKEY_FIELDS) | ||
| && !conf.contains(FlinkOptions.RECORD_KEY_FIELD)) { | ||
| conf.setString(FlinkOptions.RECORD_KEY_FIELD, tableConfig.getString(HoodieTableConfig.RECORDKEY_FIELDS)); | ||
| } | ||
| if (tableConfig.contains(HoodieTableConfig.PRECOMBINE_FIELD) | ||
| && !conf.contains(FlinkOptions.PRECOMBINE_FIELD)) { | ||
| conf.setString(FlinkOptions.PRECOMBINE_FIELD, tableConfig.getString(HoodieTableConfig.PRECOMBINE_FIELD)); | ||
| } | ||
| if (tableConfig.contains(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE) | ||
| && !conf.contains(FlinkOptions.HIVE_STYLE_PARTITIONING)) { | ||
| conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, tableConfig.getBoolean(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE)); | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| @Override | ||
| public String factoryIdentifier() { | ||
| return FACTORY_ID; | ||
|
|
@@ -119,9 +143,17 @@ public Set<ConfigOption<?>> optionalOptions() { | |
| * @param schema The table schema | ||
| */ | ||
| private void sanityCheck(Configuration conf, ResolvedSchema schema) { | ||
| List<String> fields = schema.getColumnNames(); | ||
| if (!OptionsResolver.isAppendMode(conf)) { | ||
| checkRecordKey(conf, schema); | ||
| checkPreCombineKey(conf, schema); | ||
| } | ||
| } | ||
|
|
||
| // validate record key in pk absence. | ||
| /** | ||
| * Validate the record key. | ||
| */ | ||
| private void checkRecordKey(Configuration conf, ResolvedSchema schema) { | ||
| List<String> fields = schema.getColumnNames(); | ||
| if (!schema.getPrimaryKey().isPresent()) { | ||
| String[] recordKeys = conf.get(FlinkOptions.RECORD_KEY_FIELD).split(","); | ||
| if (recordKeys.length == 1 | ||
|
|
@@ -139,8 +171,13 @@ private void sanityCheck(Configuration conf, ResolvedSchema schema) { | |
| + "'" + FlinkOptions.RECORD_KEY_FIELD.key() + "' does not exist in the table schema."); | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| // validate pre_combine key | ||
| /** | ||
| * Validate pre_combine key. | ||
| */ | ||
| private void checkPreCombineKey(Configuration conf, ResolvedSchema schema) { | ||
| List<String> fields = schema.getColumnNames(); | ||
| String preCombineField = conf.get(FlinkOptions.PRECOMBINE_FIELD); | ||
| if (!fields.contains(preCombineField)) { | ||
| if (OptionsResolver.isDefaultHoodieRecordPayloadClazz(conf)) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure whether we should use the empty string for the pk-less scenario, because all the records would have the same primary key value, which breaks the pk-less semantics; by pk-less we actually mean that all the records are unique and there is no need to define a primary key.
Another solution is to use a UUID as the primary key — WDYT?