[HUDI-2560][RFC-33] Support full Schema evolution for Spark #4910
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.hudi.async.AsyncArchiveService; | ||
| import org.apache.hudi.async.AsyncCleanerService; | ||
| import org.apache.hudi.avro.HoodieAvroUtils; | ||
| import org.apache.hudi.avro.model.HoodieCleanMetadata; | ||
| import org.apache.hudi.avro.model.HoodieCleanerPlan; | ||
| import org.apache.hudi.avro.model.HoodieClusteringPlan; | ||
|
|
@@ -49,6 +50,7 @@ | |
| import org.apache.hudi.common.model.WriteOperationType; | ||
| import org.apache.hudi.common.table.HoodieTableMetaClient; | ||
| import org.apache.hudi.common.table.HoodieTableVersion; | ||
| import org.apache.hudi.common.table.TableSchemaResolver; | ||
| import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; | ||
| import org.apache.hudi.common.table.timeline.HoodieInstant; | ||
| import org.apache.hudi.common.table.timeline.HoodieInstant.State; | ||
|
|
@@ -71,6 +73,15 @@ | |
| import org.apache.hudi.exception.HoodieSavepointException; | ||
| import org.apache.hudi.index.HoodieIndex; | ||
| import org.apache.hudi.metadata.HoodieTableMetadata; | ||
| import org.apache.hudi.internal.schema.InternalSchema; | ||
| import org.apache.hudi.internal.schema.Type; | ||
| import org.apache.hudi.internal.schema.action.InternalSchemaChangeApplier; | ||
| import org.apache.hudi.internal.schema.action.TableChange; | ||
| import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter; | ||
| import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; | ||
| import org.apache.hudi.internal.schema.utils.AvroSchemaEvolutionUtils; | ||
| import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; | ||
| import org.apache.hudi.internal.schema.utils.SerDeHelper; | ||
| import org.apache.hudi.metadata.HoodieTableMetadataWriter; | ||
| import org.apache.hudi.metadata.MetadataPartitionType; | ||
| import org.apache.hudi.metrics.HoodieMetrics; | ||
|
|
@@ -85,6 +96,7 @@ | |
|
|
||
| import com.codahale.metrics.Timer; | ||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.avro.Schema; | ||
| import org.apache.log4j.LogManager; | ||
| import org.apache.log4j.Logger; | ||
|
|
||
|
|
@@ -101,6 +113,8 @@ | |
| import java.util.stream.Collectors; | ||
| import java.util.stream.Stream; | ||
|
|
||
| import static org.apache.hudi.common.model.HoodieCommitMetadata.SCHEMA_KEY; | ||
|
|
||
| /** | ||
| * Abstract Write Client providing functionality for performing commit, index updates and rollback | ||
| * Reused for regular write operations like upsert/insert/bulk-insert.. as well as bootstrap | ||
|
|
@@ -246,12 +260,42 @@ protected void commit(HoodieTable table, String commitActionType, String instant | |
| HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); | ||
| // Finalize write | ||
| finalizeWrite(table, instantTime, stats); | ||
| // save internal schema to support implicitly adding columns during the write process | ||
| if (!metadata.getExtraMetadata().containsKey(SerDeHelper.LATEST_SCHEMA) | ||
| && metadata.getExtraMetadata().containsKey(SCHEMA_KEY) && table.getConfig().getSchemaEvolutionEnable()) { | ||
| saveInternalSchema(table, instantTime, metadata); | ||
| } | ||
| // update Metadata table | ||
| writeTableMetadata(table, instantTime, commitActionType, metadata); | ||
| activeTimeline.saveAsComplete(new HoodieInstant(true, commitActionType, instantTime), | ||
| Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); | ||
| } | ||
|
|
||
| // Save internal schema | ||
| private void saveInternalSchema(HoodieTable table, String instantTime, HoodieCommitMetadata metadata) { | ||
| TableSchemaResolver schemaUtil = new TableSchemaResolver(table.getMetaClient()); | ||
| String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElse(""); | ||
| FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(table.getMetaClient()); | ||
| if (!historySchemaStr.isEmpty()) { | ||
| InternalSchema internalSchema = InternalSchemaUtils.searchSchema(Long.parseLong(instantTime), | ||
| SerDeHelper.parseSchemas(historySchemaStr)); | ||
| Schema avroSchema = HoodieAvroUtils.createHoodieWriteSchema(new Schema.Parser().parse(config.getSchema())); | ||
| InternalSchema evolvedSchema = AvroSchemaEvolutionUtils.evolveSchemaFromNewAvroSchema(avroSchema, internalSchema); | ||
| if (evolvedSchema.equals(internalSchema)) { | ||
| metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, SerDeHelper.toJson(evolvedSchema)); | ||
| //TODO save history schema by metaTable | ||
| schemasManager.persistHistorySchemaStr(instantTime, historySchemaStr); | ||
| } else { | ||
| evolvedSchema.setSchemaId(Long.parseLong(instantTime)); | ||
| String newSchemaStr = SerDeHelper.toJson(evolvedSchema); | ||
| metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, newSchemaStr); | ||
| schemasManager.persistHistorySchemaStr(instantTime, SerDeHelper.inheritSchemas(evolvedSchema, historySchemaStr)); | ||
| } | ||
| // update SCHEMA_KEY | ||
|
Contributor: Regarding starting a separate timeline for schemas: is there a possibility to reuse the existing meta files for the internal schema? And do we plan to replace the Avro schema with the internal schema in the future? The Avro schema cannot handle data types like …
Contributor (Author): answer
Contributor: So what is the relationship between the DDL schema change and the schema change on write? For the schema change on write, we already reuse the schema in the instant metadata file; we should elaborate more to have a uniform abstraction for these two cases.
Contributor (Author): Hi Danny, do you mean to ask (line 292) why we use a new timeline to save the history schema?
Contributor: With this patch we have an Avro schema for the metadata file and a separate internal schema for DDL operations, and the Avro schema can also handle the schema change on write; these abstractions are not that clear and we need to elaborate more on the behaviors.
Contributor: I guess we probably need a blog or proper documentation describing the changes on a high level. WDYT @xiarixiaoyao ? |
||
| metadata.addMetadata(SCHEMA_KEY, AvroInternalSchemaConverter.convert(evolvedSchema, avroSchema.getName()).toString()); | ||
| } | ||
| } | ||
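To make the trigger for this branch concrete, here is a minimal write sketch (not part of this diff; the table name, key fields, and base path are placeholder assumptions). With `hoodie.schema.on.read.enable=true`, a write whose schema has gained a column relative to the table reaches `saveInternalSchema()` via `commitStats()`, which persists the evolved `InternalSchema` together with the updated history schema file:

```java
// Hypothetical illustration: an upsert whose incoming Dataset carries an extra
// column. All option values below are placeholders, not values from this PR.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class ImplicitEvolutionExample {
  static void upsertWithNewColumn(Dataset<Row> df, String basePath) {
    df.write().format("hudi")
        .option("hoodie.table.name", "h0")                         // placeholder
        .option("hoodie.datasource.write.recordkey.field", "id")   // placeholder
        .option("hoodie.datasource.write.precombine.field", "ts")  // placeholder
        .option("hoodie.schema.on.read.enable", "true")            // flag added in this PR
        .mode(SaveMode.Append)
        .save(basePath);
    // If df's schema added a column, the commit metadata's SCHEMA_KEY differs from
    // the stored internal schema, and saveInternalSchema() persists the evolved one.
  }
}
```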
|
|
||
| protected HoodieTable createTable(HoodieWriteConfig config, Configuration hadoopConf) { | ||
| return createTable(config, hadoopConf, false); | ||
| } | ||
|
|
@@ -1442,8 +1486,8 @@ protected void setWriteSchemaForDeletes(HoodieTableMetaClient metaClient) { | |
| if (lastInstant.isPresent()) { | ||
| HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( | ||
| activeTimeline.getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class); | ||
| if (commitMetadata.getExtraMetadata().containsKey(HoodieCommitMetadata.SCHEMA_KEY)) { | ||
| config.setSchema(commitMetadata.getExtraMetadata().get(HoodieCommitMetadata.SCHEMA_KEY)); | ||
| if (commitMetadata.getExtraMetadata().containsKey(SCHEMA_KEY)) { | ||
| config.setSchema(commitMetadata.getExtraMetadata().get(SCHEMA_KEY)); | ||
| } else { | ||
| throw new HoodieIOException("Latest commit does not have any schema in commit metadata"); | ||
| } | ||
|
|
@@ -1505,4 +1549,138 @@ private void tryUpgrade(HoodieTableMetaClient metaClient, Option<String> instant | |
| metaClient.reloadActiveTimeline(); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * add columns to table. | ||
| * | ||
| * @param colName col name to be added. if we want to add a col to a nested field, the full name should be specified | ||
| * @param schema col type to be added. | ||
| * @param doc col doc to be added. | ||
| * @param position col position to be added | ||
| * @param positionType col position change type. now supports three change types: first/after/before | ||
| */ | ||
| public void addColumn(String colName, Schema schema, String doc, String position, TableChange.ColumnPositionChange.ColumnPositionType positionType) { | ||
| Pair<InternalSchema, HoodieTableMetaClient> pair = getInternalSchemaAndMetaClient(); | ||
| InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()) | ||
| .applyAddChange(colName, AvroInternalSchemaConverter.convertToField(schema), doc, position, positionType); | ||
| commitTableChange(newSchema, pair.getRight()); | ||
| } | ||
|
|
||
| public void addColumn(String colName, Schema schema) { | ||
| addColumn(colName, schema, null, "", TableChange.ColumnPositionChange.ColumnPositionType.NO_OPERATION); | ||
| } | ||
|
|
||
| /** | ||
| * delete columns from table. | ||
| * | ||
| * @param colNames col names to be deleted. if we want to delete a col from a nested field, the full name should be specified | ||
| */ | ||
| public void deleteColumns(String... colNames) { | ||
| Pair<InternalSchema, HoodieTableMetaClient> pair = getInternalSchemaAndMetaClient(); | ||
| InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyDeleteChange(colNames); | ||
| commitTableChange(newSchema, pair.getRight()); | ||
| } | ||
|
|
||
| /** | ||
| * rename col name for hudi table. | ||
| * | ||
| * @param colName col name to be renamed. if we want to rename a col in a nested field, the full name should be specified | ||
| * @param newName new name for the current col. no need to specify the full name. | ||
| */ | ||
| public void renameColumn(String colName, String newName) { | ||
| Pair<InternalSchema, HoodieTableMetaClient> pair = getInternalSchemaAndMetaClient(); | ||
| InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyRenameChange(colName, newName); | ||
| commitTableChange(newSchema, pair.getRight()); | ||
| } | ||
|
|
||
| /** | ||
| * update col nullable attribute for hudi table. | ||
| * | ||
| * @param colName col name to be changed. if we want to change a col in a nested field, the full name should be specified | ||
| * @param nullable whether the column should be nullable. | ||
| */ | ||
| public void updateColumnNullability(String colName, boolean nullable) { | ||
| Pair<InternalSchema, HoodieTableMetaClient> pair = getInternalSchemaAndMetaClient(); | ||
| InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyColumnNullabilityChange(colName, nullable); | ||
| commitTableChange(newSchema, pair.getRight()); | ||
| } | ||
|
|
||
| /** | ||
| * update col Type for hudi table. | ||
| * only supports updating a primitive type to another primitive type. | ||
| * cannot update a nested type to another nested type or to a primitive type, e.g. RecordType -> MapType, MapType -> LongType. | ||
| * | ||
| * @param colName col name to be changed. if we want to change a col in a nested field, the full name should be specified | ||
| * @param newType new type for the column. | ||
| */ | ||
| public void updateColumnType(String colName, Type newType) { | ||
| Pair<InternalSchema, HoodieTableMetaClient> pair = getInternalSchemaAndMetaClient(); | ||
| InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyColumnTypeChange(colName, newType); | ||
| commitTableChange(newSchema, pair.getRight()); | ||
| } | ||
|
|
||
| /** | ||
| * update col comment for hudi table. | ||
| * | ||
| * @param colName col name to be changed. if we want to change a col in a nested field, the full name should be specified | ||
| * @param doc new comment for the column. | ||
| */ | ||
| public void updateColumnComment(String colName, String doc) { | ||
|
Contributor: Where are all these functions getting used? I do not see any caller for these @xiarixiaoyao
Contributor (Author): These are the exposed API interfaces. |
||
| Pair<InternalSchema, HoodieTableMetaClient> pair = getInternalSchemaAndMetaClient(); | ||
| InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()).applyColumnCommentChange(colName, doc); | ||
| commitTableChange(newSchema, pair.getRight()); | ||
| } | ||
|
|
||
| /** | ||
| * reorder the position of col. | ||
| * | ||
| * @param colName column which needs to be reordered. if we want to change a col in a nested field, the full name should be specified. | ||
| * @param referColName reference position. | ||
| * @param orderType col position change type. now supports three change types: first/after/before | ||
| */ | ||
| public void reOrderColPosition(String colName, String referColName, TableChange.ColumnPositionChange.ColumnPositionType orderType) { | ||
| if (colName == null || orderType == null || referColName == null) { | ||
| return; | ||
| } | ||
| //get internalSchema | ||
| Pair<InternalSchema, HoodieTableMetaClient> pair = getInternalSchemaAndMetaClient(); | ||
| InternalSchema newSchema = new InternalSchemaChangeApplier(pair.getLeft()) | ||
| .applyReOrderColPositionChange(colName, referColName, orderType); | ||
| commitTableChange(newSchema, pair.getRight()); | ||
| } | ||
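Taken together, the methods above form the DDL-style schema-evolution API surface. A minimal usage sketch follows, assuming an already-built `SparkRDDWriteClient` with `hoodie.schema.on.read.enable=true`; the column names (`note`, `age`, `address.zip`, etc.) are hypothetical and must exist in the target table's schema:

```java
// Hypothetical usage of the schema-evolution API added above.
import org.apache.avro.Schema;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.action.TableChange;

public class DdlEvolutionExample {
  static void evolve(SparkRDDWriteClient<?> client) {
    // append a new string column at the tail of the schema
    client.addColumn("note", Schema.create(Schema.Type.STRING));

    // add a documented column positioned right after an existing column "age"
    client.addColumn("score", Schema.create(Schema.Type.DOUBLE), "user score",
        "age", TableChange.ColumnPositionChange.ColumnPositionType.AFTER);

    // rename, widen a primitive type, and drop a nested column (full name required)
    client.renameColumn("note", "comment");
    client.updateColumnType("age", Types.LongType.get());
    client.deleteColumns("address.zip");

    // move "score" to sit right after "comment"
    client.reOrderColPosition("score", "comment",
        TableChange.ColumnPositionChange.ColumnPositionType.AFTER);
  }
}
```

Each call resolves the current `InternalSchema`, applies the change through `InternalSchemaChangeApplier`, and commits it as an `ALTER_SCHEMA` instant via `commitTableChange()` below.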
|
|
||
| private Pair<InternalSchema, HoodieTableMetaClient> getInternalSchemaAndMetaClient() { | ||
| HoodieTableMetaClient metaClient = createMetaClient(true); | ||
| TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); | ||
| Option<InternalSchema> internalSchemaOption = schemaUtil.getTableInternalSchemaFromCommitMetadata(); | ||
| if (!internalSchemaOption.isPresent()) { | ||
| throw new HoodieException(String.format("cannot find schema for current table: %s", config.getBasePath())); | ||
| } | ||
| return Pair.of(internalSchemaOption.get(), metaClient); | ||
| } | ||
|
|
||
| private void commitTableChange(InternalSchema newSchema, HoodieTableMetaClient metaClient) { | ||
| TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); | ||
| String historySchemaStr = schemaUtil.getTableHistorySchemaStrFromCommitMetadata().orElse(""); | ||
| Schema schema = AvroInternalSchemaConverter.convert(newSchema, config.getTableName()); | ||
| String commitActionType = CommitUtils.getCommitActionType(WriteOperationType.ALTER_SCHEMA, metaClient.getTableType()); | ||
| String instantTime = HoodieActiveTimeline.createNewInstantTime(); | ||
| startCommitWithTime(instantTime, commitActionType, metaClient); | ||
| config.setSchema(schema.toString()); | ||
| HoodieActiveTimeline timeLine = metaClient.getActiveTimeline(); | ||
| HoodieInstant requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime); | ||
| HoodieCommitMetadata metadata = new HoodieCommitMetadata(); | ||
| metadata.setOperationType(WriteOperationType.ALTER_SCHEMA); | ||
| try { | ||
| timeLine.transitionRequestedToInflight(requested, Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8))); | ||
| } catch (IOException io) { | ||
| throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", io); | ||
| } | ||
| Map<String, String> extraMeta = new HashMap<>(); | ||
| extraMeta.put(SerDeHelper.LATEST_SCHEMA, SerDeHelper.toJson(newSchema.setSchemaId(Long.parseLong(instantTime)))); | ||
| // try to save history schemas | ||
| FileBasedInternalSchemaStorageManager schemasManager = new FileBasedInternalSchemaStorageManager(metaClient); | ||
| schemasManager.persistHistorySchemaStr(instantTime, SerDeHelper.inheritSchemas(newSchema, historySchemaStr)); | ||
|
Contributor: What is the purpose of storing the history schema here? I guess this is redundant, since we anyway store the evolved schema as the history schema in the saveInternalSchema() method, which gets called from commitStats(). WDYT @xiarixiaoyao ?
Contributor (Author): Of course; already pinged you in Slack. |
||
| commitStats(instantTime, Collections.EMPTY_LIST, Option.of(extraMeta), commitActionType); | ||
| } | ||
| } | ||
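As a recap of the flow in this file (a sketch under assumptions, not in-tree code): `commitTableChange()` stamps the new schema with the instant time as its schema id and persists the merged history JSON, so a reader can resolve the schema that was active at any instant with the same helpers used above:

```java
// Sketch: resolving the schema active at a given instant, composing the same
// helpers this diff uses. `historySchemaStr` is the JSON persisted by
// FileBasedInternalSchemaStorageManager.persistHistorySchemaStr(...).
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;
import org.apache.hudi.internal.schema.utils.SerDeHelper;

public class SchemaLookupExample {
  static InternalSchema schemaAt(String instantTime, String historySchemaStr) {
    // schema ids are instant times, so the lookup key is the parsed instant
    return InternalSchemaUtils.searchSchema(Long.parseLong(instantTime),
        SerDeHelper.parseSchemas(historySchemaStr));
  }
}
```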
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -167,6 +167,22 @@ public class HoodieWriteConfig extends HoodieConfig { | |
| + "implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema " | ||
| + "evolving records during an update."); | ||
|
|
||
| public static final ConfigProperty<String> INTERNAL_SCHEMA_STRING = ConfigProperty | ||
|
Member: will revisit these docs. |
||
| .key("hoodie.internal.schema") | ||
| .noDefaultValue() | ||
| .withDocumentation("Schema string representing the latest schema of the table. Hudi passes this to " | ||
| + "implementations of evolution of schema"); | ||
|
|
||
| public static final ConfigProperty<Boolean> SCHEMA_EVOLUTION_ENABLE = ConfigProperty | ||
|
Member: This almost makes it seem like Hudi does not evolve schemas today. Could we call this …
Contributor (Author): no problem, let me modify it
Contributor (Author): fixed
Contributor: @xiarixiaoyao: It is better to prefix the config with hoodie. So, hoodie.schema.on.read.enable
Contributor (Author): fixed |
||
| .key("hoodie.schema.on.read.enable") | ||
| .defaultValue(false) | ||
| .withDocumentation("enable full schema evolution for hoodie"); | ||
|
|
||
| public static final ConfigProperty<Boolean> ENABLE_INTERNAL_SCHEMA_CACHE = ConfigProperty | ||
| .key("hoodie.schema.cache.enable") | ||
| .defaultValue(false) | ||
| .withDocumentation("cache query internalSchemas in driver/executor side"); | ||
|
|
||
| public static final ConfigProperty<String> AVRO_SCHEMA_VALIDATE_ENABLE = ConfigProperty | ||
| .key("hoodie.avro.schema.validate") | ||
| .defaultValue("false") | ||
|
|
@@ -886,6 +902,30 @@ public void setSchema(String schemaStr) { | |
| setValue(AVRO_SCHEMA_STRING, schemaStr); | ||
| } | ||
|
|
||
| public String getInternalSchema() { | ||
| return getString(INTERNAL_SCHEMA_STRING); | ||
| } | ||
|
|
||
| public boolean getInternalSchemaCacheEnable() { | ||
| return getBoolean(ENABLE_INTERNAL_SCHEMA_CACHE); | ||
| } | ||
|
|
||
| public void setInternalSchemaString(String internalSchemaString) { | ||
| setValue(INTERNAL_SCHEMA_STRING, internalSchemaString); | ||
| } | ||
|
|
||
| public void setInternalSchemaCacheEnable(boolean enable) { | ||
| setValue(ENABLE_INTERNAL_SCHEMA_CACHE, String.valueOf(enable)); | ||
| } | ||
|
|
||
| public boolean getSchemaEvolutionEnable() { | ||
| return getBoolean(SCHEMA_EVOLUTION_ENABLE); | ||
| } | ||
|
|
||
| public void setSchemaEvolutionEnable(boolean enable) { | ||
| setValue(SCHEMA_EVOLUTION_ENABLE, String.valueOf(enable)); | ||
| } | ||
|
|
||
| /** | ||
| * Get the write schema for written records. | ||
| * | ||
|
|
@@ -2075,6 +2115,16 @@ public Builder withSchema(String schemaStr) { | |
| return this; | ||
| } | ||
|
|
||
| public Builder withSchemaEvolutionEnable(boolean enable) { | ||
| writeConfig.setValue(SCHEMA_EVOLUTION_ENABLE, String.valueOf(enable)); | ||
| return this; | ||
| } | ||
|
|
||
| public Builder withInternalSchemaCacheEnable(boolean enable) { | ||
| writeConfig.setValue(ENABLE_INTERNAL_SCHEMA_CACHE, String.valueOf(enable)); | ||
| return this; | ||
| } | ||
|
|
||
| public Builder withAvroSchemaValidate(boolean enable) { | ||
| writeConfig.setValue(AVRO_SCHEMA_VALIDATE_ENABLE, String.valueOf(enable)); | ||
| return this; | ||
|
|
||
Contributor: Pull into a separate method?
Contributor (Author): agree
Contributor (Author): fixed