-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-3974] Fix schema projection to skip non-existent preCombine field #5427
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,21 +25,25 @@ | |
| import org.apache.hudi.internal.schema.Types; | ||
| import org.apache.hudi.internal.schema.Types.Field; | ||
|
|
||
| import org.apache.log4j.LogManager; | ||
| import org.apache.log4j.Logger; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.Deque; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.TreeMap; | ||
| import java.util.SortedMap; | ||
| import java.util.Set; | ||
| import java.util.SortedMap; | ||
| import java.util.TreeMap; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| /** | ||
| * Util methods to help us do some operations on InternalSchema. | ||
| * eg: column prune, filter rebuild for query engine... | ||
| */ | ||
| public class InternalSchemaUtils { | ||
| private static final Logger LOG = LogManager.getLogger(InternalSchemaUtils.class); | ||
|
|
||
| private InternalSchemaUtils() { | ||
| } | ||
|
|
@@ -54,29 +58,75 @@ private InternalSchemaUtils() { | |
| */ | ||
| public static InternalSchema pruneInternalSchema(InternalSchema schema, List<String> names) { | ||
| // do check | ||
| List<Integer> prunedIds = names.stream().map(name -> { | ||
| List<Integer> prunedIds = names.stream() | ||
| .filter(name -> { | ||
| int id = schema.findIdByName(name); | ||
| if (id < 0) { | ||
| LOG.warn(String.format("cannot prune col: %s does not exist in hudi table", name)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prior to this patch, we were throwing exception and now we are not? is this change intended? |
||
| return false; | ||
| } | ||
| return true; | ||
| }) | ||
| .map(schema::findIdByName).collect(Collectors.toList()); | ||
| // find top parent field ID. eg: a.b.c, f.g.h, only collect id of a and f ignore all child field. | ||
| List<Integer> topParentFieldIds = new ArrayList<>(); | ||
| names.stream().forEach(f -> { | ||
| int id = schema.findIdByName(f.split("\\.")[0]); | ||
| if (!topParentFieldIds.contains(id)) { | ||
| topParentFieldIds.add(id); | ||
| } | ||
| }); | ||
| return pruneInternalSchemaByID(schema, prunedIds, topParentFieldIds); | ||
| } | ||
|
|
||
| /** | ||
| * Create project internalSchema, based on the project names which produced by query engine and Hudi fields. | ||
| * support nested project. | ||
| * | ||
| * @param schema a internal schema. | ||
| * @param queryFields project names produced by query engine. | ||
| * @param hudiFields project names required by Hudi merging. | ||
| * @return a project internalSchema. | ||
| */ | ||
| public static InternalSchema pruneInternalSchema(InternalSchema schema, List<String> queryFields, List<String> hudiFields) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with the addition of this new method, is method at L 59 called anywhere? I expect all callers to use this instead of that? |
||
| // do check | ||
| List<Integer> allPrunedIds = queryFields.stream().map(name -> { | ||
| int id = schema.findIdByName(name); | ||
| if (id == -1) { | ||
| throw new IllegalArgumentException(String.format("cannot prune col: %s which not exisit in hudi table", name)); | ||
| throw new IllegalArgumentException(String.format("Cannot prune col from query: %s does not exist in hudi table", name)); | ||
| } | ||
| return id; | ||
| }).collect(Collectors.toList()); | ||
| List<String> allFields = new ArrayList<>(queryFields); | ||
| // Filter non-existent Hudi fields | ||
| List<String> filteredHudiFields = hudiFields.stream() | ||
| .filter(name -> { | ||
| int id = schema.findIdByName(name); | ||
| if (id < 0) { | ||
| LOG.warn(String.format("Cannot prune col from Hudi: %s does not exist in hudi table", name)); | ||
| return false; | ||
| } | ||
| return true; | ||
| }).collect(Collectors.toList()); | ||
| allFields.addAll(filteredHudiFields); | ||
| allPrunedIds.addAll(filteredHudiFields.stream() | ||
| .map(schema::findIdByName).collect(Collectors.toList())); | ||
| // find top parent field ID. eg: a.b.c, f.g.h, only collect id of a and f ignore all child field. | ||
| List<Integer> topParentFieldIds = new ArrayList<>(); | ||
| names.stream().forEach(f -> { | ||
| allFields.forEach(f -> { | ||
| int id = schema.findIdByName(f.split("\\.")[0]); | ||
| if (!topParentFieldIds.contains(id)) { | ||
| topParentFieldIds.add(id); | ||
| } | ||
| }); | ||
| return pruneInternalSchemaByID(schema, prunedIds, topParentFieldIds); | ||
| return pruneInternalSchemaByID(schema, allPrunedIds, topParentFieldIds); | ||
| } | ||
|
|
||
| /** | ||
| * Create project internalSchema. | ||
| * support nested project. | ||
| * | ||
| * @param schema a internal schema. | ||
| * @param schema a internal schema. | ||
| * @param fieldIds project col field_ids. | ||
| * @return a project internalSchema. | ||
| */ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you help me understand something. I understand if non existant preCombine is part of the names, we ignore it.
But if someone does a query "select a,b,c from tbl", where b does not even exist in the table, we have to throw exception. Can you confirm that is not affected by this fix here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, the filtering should not happen after a second thought. I'm going to rethink how to make the fix.