-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-5194] Fix schema files cleaning by FileBasedInternalSchemaStorageManager #7183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,7 @@ | |
| import org.apache.hudi.common.model.HoodieRecordPayload; | ||
| import org.apache.hudi.common.model.HoodieWriteStat; | ||
| import org.apache.hudi.common.model.WriteOperationType; | ||
| import org.apache.hudi.common.table.TableSchemaResolver; | ||
| import org.apache.hudi.common.table.timeline.HoodieTimeline; | ||
| import org.apache.hudi.common.util.CompactionUtils; | ||
| import org.apache.hudi.common.util.Option; | ||
|
|
@@ -85,9 +86,12 @@ public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() { | |
|
|
||
| // try to load internalSchema to support schema Evolution | ||
| HoodieWriteConfig configCopy = config; | ||
| Pair<Option<String>, Option<String>> schemaPair = InternalSchemaCache | ||
| .getInternalSchemaAndAvroSchemaForClusteringAndCompaction(table.getMetaClient(), instantTime); | ||
| if (schemaPair.getLeft().isPresent() && schemaPair.getRight().isPresent()) { | ||
| boolean schemaEvolutionEnable = new TableSchemaResolver(table.getMetaClient()).getTableInternalSchemaFromCommitMetadata().isPresent(); | ||
| Pair<Option<String>, Option<String>> schemaPair = Pair.of(Option.empty(), Option.empty()); | ||
| if (schemaEvolutionEnable) { | ||
| schemaPair = InternalSchemaCache.getInternalSchemaAndAvroSchemaForClusteringAndCompaction(table.getMetaClient(), instantTime); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Optimize the code: trigger the corresponding logic only when schema evolution is enabled. |
||
| } | ||
| if (schemaEvolutionEnable && schemaPair.getLeft().isPresent() && schemaPair.getRight().isPresent()) { | ||
| // should not influence the original config, just copy it | ||
| configCopy = HoodieWriteConfig.newBuilder().withProperties(config.getProps()).build(); | ||
| configCopy.setInternalSchemaString(schemaPair.getLeft().get()); | ||
|
|
@@ -105,7 +109,7 @@ public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() { | |
| metadata.addWriteStat(stat.getPartitionPath(), stat); | ||
| } | ||
| metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema()); | ||
| if (schemaPair.getLeft().isPresent()) { | ||
| if (schemaEvolutionEnable) { | ||
| metadata.addMetadata(SerDeHelper.LATEST_SCHEMA, schemaPair.getLeft().get()); | ||
| metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schemaPair.getRight().get()); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,6 +52,9 @@ | |
| import org.apache.avro.generic.IndexedRecord; | ||
| import org.apache.hadoop.fs.FileSystem; | ||
| import org.apache.hadoop.fs.Path; | ||
| import org.apache.hudi.internal.schema.io.FileBasedInternalSchemaStorageManager; | ||
| import org.apache.hudi.internal.schema.utils.InternalSchemaUtils; | ||
| import org.apache.hudi.internal.schema.utils.SerDeHelper; | ||
| import org.apache.log4j.LogManager; | ||
| import org.apache.log4j.Logger; | ||
|
|
||
|
|
@@ -67,6 +70,7 @@ | |
| import java.util.Map; | ||
| import java.util.Properties; | ||
| import java.util.Set; | ||
| import java.util.TreeMap; | ||
| import java.util.concurrent.atomic.AtomicLong; | ||
| import java.util.function.Function; | ||
| import java.util.stream.Collectors; | ||
|
|
@@ -127,6 +131,8 @@ public abstract class AbstractHoodieLogRecordReader { | |
| private AtomicLong totalLogFiles = new AtomicLong(0); | ||
| // Internal schema, used to support full schema evolution. | ||
| private InternalSchema internalSchema; | ||
| // Historical Schemas, only used when schema evolution enabled. | ||
| private TreeMap<Long, InternalSchema> historicalSchemas; | ||
| // Hoodie table path. | ||
| private final String path; | ||
| // Total log blocks read - for metrics | ||
|
|
@@ -810,13 +816,23 @@ private Option<Function<IndexedRecord, IndexedRecord>> composeEvolvedSchemaTrans | |
| } | ||
|
|
||
| long currentInstantTime = Long.parseLong(dataBlock.getLogBlockHeader().get(INSTANT_TIME)); | ||
| InternalSchema fileSchema = InternalSchemaCache.searchSchemaAndCache(currentInstantTime, | ||
| hoodieTableMetaClient, false); | ||
| if (historicalSchemas == null) { | ||
| FileBasedInternalSchemaStorageManager schemaStorageManager = new FileBasedInternalSchemaStorageManager(hoodieTableMetaClient); | ||
| historicalSchemas = SerDeHelper.parseSchemas(schemaStorageManager.getHistorySchemaStr()); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cache the historical schemas, reducing the overhead of searching for the file schema. |
||
| } | ||
| InternalSchema fileSchema; | ||
| long maxVersionId = historicalSchemas.keySet().stream().max(Long::compareTo).orElse(0L); | ||
| if (maxVersionId >= currentInstantTime) { | ||
| fileSchema = InternalSchemaUtils.searchSchema(currentInstantTime, historicalSchemas); | ||
| } else { | ||
| fileSchema = InternalSchemaCache.getInternalSchemaByVersionId(currentInstantTime, hoodieTableMetaClient); | ||
| historicalSchemas.put(currentInstantTime, fileSchema); | ||
| } | ||
| InternalSchema mergedInternalSchema = new InternalSchemaMerger(fileSchema, internalSchema, | ||
| true, false).mergeSchema(); | ||
| Schema mergedAvroSchema = AvroInternalSchemaConverter.convert(mergedInternalSchema, readerSchema.getFullName()); | ||
|
|
||
| return Option.of((record) -> rewriteRecordWithNewSchema(record, mergedAvroSchema, Collections.emptyMap())); | ||
| return Option.of((record) -> rewriteRecordWithNewSchema(record, mergedAvroSchema, InternalSchemaUtils.collectRenameCols(fileSchema, internalSchema))); | ||
| } | ||
|
|
||
| /** | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove useless check