-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-6055] Fix input format for bootstrap tables #8397
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,7 +58,7 @@ public HiveHoodieTableFileIndex(HoodieEngineContext engineContext, | |
| shouldIncludePendingCommits, | ||
| true, | ||
| new NoopCache(), | ||
| false); | ||
| true); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now I remember we need to fix the lazy listing for Hive File Index. Should this be in a separate PR?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this should not have been part of this PR. Actually, it doesn't really matter for the Hudi connector, as it doesn't go through the COW input format code. And for the Hive connector, we already saw that the partition loader will instantiate this on every call. However, the actual perf issue for the Hive connector was fixed due to #7527 (comment) |
||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -58,7 +58,6 @@ | |||
| import org.apache.hadoop.mapred.JobConf; | ||||
| import org.apache.log4j.LogManager; | ||||
| import org.apache.log4j.Logger; | ||||
| import java.io.IOException; | ||||
|
|
||||
| import java.util.ArrayList; | ||||
| import java.util.Arrays; | ||||
|
|
@@ -82,32 +81,25 @@ public class SchemaEvolutionContext { | |||
|
|
||||
| private final InputSplit split; | ||||
| private final JobConf job; | ||||
| private HoodieTableMetaClient metaClient; | ||||
| private final HoodieTableMetaClient metaClient; | ||||
| public Option<InternalSchema> internalSchemaOption; | ||||
|
|
||||
| public SchemaEvolutionContext(InputSplit split, JobConf job) throws IOException { | ||||
| public SchemaEvolutionContext(InputSplit split, JobConf job) { | ||||
| this(split, job, Option.empty()); | ||||
| } | ||||
|
|
||||
| public SchemaEvolutionContext(InputSplit split, JobConf job, Option<HoodieTableMetaClient> metaClientOption) throws IOException { | ||||
| public SchemaEvolutionContext(InputSplit split, JobConf job, Option<HoodieTableMetaClient> metaClientOption) { | ||||
| this.split = split; | ||||
| this.job = job; | ||||
| this.metaClient = metaClientOption.isPresent() ? metaClientOption.get() : setUpHoodieTableMetaClient(); | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. now the init of internalSchemaOption has been removed. hudi/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java Line 91 in 83d4fe1
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh ok then there is no need of this PR. |
||||
| if (this.metaClient == null) { | ||||
| internalSchemaOption = Option.empty(); | ||||
| return; | ||||
| } | ||||
| try { | ||||
| TableSchemaResolver schemaUtil = new TableSchemaResolver(metaClient); | ||||
| this.internalSchemaOption = schemaUtil.getTableInternalSchemaFromCommitMetadata(); | ||||
| } catch (Exception e) { | ||||
| internalSchemaOption = Option.empty(); | ||||
| LOG.warn(String.format("failed to get internal Schema from hudi table:%s", metaClient.getBasePathV2()), e); | ||||
| } | ||||
| LOG.info(String.format("finish init schema evolution for split: %s", split)); | ||||
| } | ||||
|
|
||||
| private HoodieTableMetaClient setUpHoodieTableMetaClient() throws IOException { | ||||
| private HoodieTableMetaClient setUpHoodieTableMetaClient() { | ||||
| try { | ||||
| Path inputPath = ((FileSplit)split).getPath(); | ||||
| FileSystem fs = inputPath.getFileSystem(job); | ||||
|
|
@@ -159,27 +151,26 @@ public void doEvolutionForRealtimeInputFormat(AbstractRealtimeRecordReader realt | |||
| * Do schema evolution for ParquetFormat. | ||||
| */ | ||||
| public void doEvolutionForParquetFormat() { | ||||
| if (internalSchemaOption.isPresent()) { | ||||
| List<String> requiredColumns = getRequireColumn(job); | ||||
| // No need trigger schema evolution for count(*)/count(1) operation | ||||
| boolean disableSchemaEvolution = requiredColumns.isEmpty() || (requiredColumns.size() == 1 && requiredColumns.get(0).isEmpty()); | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To clarify, do
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. This is existing logic. Still wondering the same question. |
||||
| if (!disableSchemaEvolution) { | ||||
| if (!internalSchemaOption.isPresent()) { | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this condition be |
||||
| internalSchemaOption = new TableSchemaResolver(metaClient).getTableInternalSchemaFromCommitMetadata(); | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still do try .. catch .. here in case the internal schema cannot be read? |
||||
| } | ||||
| // reading hoodie schema evolution table | ||||
| job.setBoolean(HIVE_EVOLUTION_ENABLE, true); | ||||
| Path finalPath = ((FileSplit)split).getPath(); | ||||
| Path finalPath = ((FileSplit) split).getPath(); | ||||
| InternalSchema prunedSchema; | ||||
| List<String> requiredColumns = getRequireColumn(job); | ||||
| // No need trigger schema evolution for count(*)/count(1) operation | ||||
| boolean disableSchemaEvolution = | ||||
| requiredColumns.isEmpty() || (requiredColumns.size() == 1 && requiredColumns.get(0).isEmpty()); | ||||
| if (!disableSchemaEvolution) { | ||||
| prunedSchema = InternalSchemaUtils.pruneInternalSchema(internalSchemaOption.get(), requiredColumns); | ||||
| InternalSchema querySchema = prunedSchema; | ||||
| Long commitTime = Long.valueOf(FSUtils.getCommitTime(finalPath.getName())); | ||||
| InternalSchema fileSchema = InternalSchemaCache.searchSchemaAndCache(commitTime, metaClient, false); | ||||
| InternalSchema mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchema, true, | ||||
| true).mergeSchema(); | ||||
| List<Types.Field> fields = mergedInternalSchema.columns(); | ||||
| setColumnNameList(job, fields); | ||||
| setColumnTypeList(job, fields); | ||||
| pushDownFilter(job, querySchema, fileSchema); | ||||
| } | ||||
| prunedSchema = InternalSchemaUtils.pruneInternalSchema(internalSchemaOption.get(), requiredColumns); | ||||
| InternalSchema querySchema = prunedSchema; | ||||
| Long commitTime = Long.valueOf(FSUtils.getCommitTime(finalPath.getName())); | ||||
| InternalSchema fileSchema = InternalSchemaCache.searchSchemaAndCache(commitTime, metaClient, false); | ||||
| InternalSchema mergedInternalSchema = new InternalSchemaMerger(fileSchema, querySchema, true, true).mergeSchema(); | ||||
| List<Types.Field> fields = mergedInternalSchema.columns(); | ||||
| setColumnNameList(job, fields); | ||||
| setColumnTypeList(job, fields); | ||||
| pushDownFilter(job, querySchema, fileSchema); | ||||
|
Comment on lines
161
to
+173
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and should this part be guarded by |
||||
| } | ||||
| } | ||||
|
|
||||
|
|
||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I assume we still need to fail here instead of printing the warning and letting it return?