-
Notifications
You must be signed in to change notification settings - Fork 4.8k
[WIP] Iceberg: Do not use HMS stats when statsSource is Iceberg #5400
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -118,7 +118,6 @@ | |
| import org.apache.hadoop.hive.ql.security.authorization.HiveCustomStorageHandlerUtils; | ||
| import org.apache.hadoop.hive.ql.session.SessionState; | ||
| import org.apache.hadoop.hive.ql.session.SessionStateUtil; | ||
| import org.apache.hadoop.hive.ql.stats.Partish; | ||
| import org.apache.hadoop.hive.serde2.AbstractSerDe; | ||
| import org.apache.hadoop.hive.serde2.DefaultFetchFormatter; | ||
| import org.apache.hadoop.hive.serde2.Deserializer; | ||
|
|
@@ -450,44 +449,45 @@ public void appendFiles(org.apache.hadoop.hive.metastore.api.Table table, URI fr | |
| } | ||
|
|
||
| @Override | ||
| public Map<String, String> getBasicStatistics(Partish partish) { | ||
| org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable(); | ||
| public Map<String, String> getBasicStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { | ||
| // For write queries where rows got modified, don't fetch from cache as values could have changed. | ||
| Table table = getTable(hmsTable); | ||
| Map<String, String> stats = Maps.newHashMap(); | ||
| if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { | ||
| if (table.currentSnapshot() != null) { | ||
| Map<String, String> summary = table.currentSnapshot().summary(); | ||
| if (summary != null) { | ||
| if (!getStatsSource().equals(HiveMetaHook.ICEBERG)) { | ||
| return hmsTable.getParameters(); | ||
| } | ||
| Table table = getTable(hmsTable); | ||
|
|
||
| if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { | ||
| stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); | ||
| } | ||
| if (table.currentSnapshot() != null) { | ||
| Map<String, String> summary = table.currentSnapshot().summary(); | ||
| if (summary != null) { | ||
|
|
||
| if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) { | ||
| long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP)); | ||
| if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && | ||
| summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { | ||
| if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) { | ||
| stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); | ||
| } | ||
|
|
||
| long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP)); | ||
| long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP)); | ||
| if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) { | ||
| long totalRecords = Long.parseLong(summary.get(SnapshotSummary.TOTAL_RECORDS_PROP)); | ||
| if (summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) && | ||
| summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) { | ||
|
|
||
| long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes); | ||
| totalRecords = actualRecords > 0 ? actualRecords : totalRecords; | ||
| // actualRecords maybe -ve in edge cases | ||
| } | ||
| stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords)); | ||
| } | ||
| long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP)); | ||
| long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP)); | ||
|
|
||
| if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) { | ||
| stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); | ||
| long actualRecords = totalRecords - (totalEqDeletes > 0 ? 0 : totalPosDeletes); | ||
| totalRecords = actualRecords > 0 ? actualRecords : totalRecords; | ||
| // actualRecords maybe -ve in edge cases | ||
| } | ||
| stats.put(StatsSetupConst.ROW_COUNT, String.valueOf(totalRecords)); | ||
| } | ||
|
|
||
| if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) { | ||
| stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); | ||
| } | ||
| } else { | ||
| stats.put(StatsSetupConst.NUM_FILES, "0"); | ||
| stats.put(StatsSetupConst.ROW_COUNT, "0"); | ||
| stats.put(StatsSetupConst.TOTAL_SIZE, "0"); | ||
| } | ||
| } else { | ||
| stats.put(StatsSetupConst.NUM_FILES, "0"); | ||
| stats.put(StatsSetupConst.ROW_COUNT, "0"); | ||
| stats.put(StatsSetupConst.TOTAL_SIZE, "0"); | ||
| } | ||
| return stats; | ||
| } | ||
|
|
@@ -600,7 +600,10 @@ private ColumnStatistics readColStats(Table table, Path statsPath) { | |
|
|
||
| @Override | ||
| public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) { | ||
| if (getStatsSource().equals(HiveMetaHook.ICEBERG) && hmsTable.getMetaTable() == null) { | ||
| if (hmsTable.getMetaTable() != null) { | ||
| return false; | ||
| } | ||
| if (getStatsSource().equals(HiveMetaHook.ICEBERG)) { | ||
| Table table = getTable(hmsTable); | ||
| if (table.currentSnapshot() != null) { | ||
| Map<String, String> summary = table.currentSnapshot().summary(); | ||
|
|
@@ -613,7 +616,7 @@ public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Tabl | |
| } | ||
| } | ||
| } | ||
| return false; | ||
| return true; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In case of delete files, if the Iceberg statsSource cannot compute a query (e.g. count(*)) using stats, I think HMS stats can't either. They both come from the same place — we should not give the wrong impression that HMS can provide accurate stats.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Even if we do `ALTER TABLE ... COMPUTE STATISTICS`?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch! And after the job. But if the statsSource is Iceberg, and in the case of delete files, even if we have done the job, this will look a little weird: users run a job [code snippet omitted in this view] and yet the stats still cannot be used. |
||
| } | ||
|
|
||
| private String getStatsSource() { | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -945,7 +945,10 @@ private Long getRowCnt( | |||||||||||
| if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(tbl, tbl.getParameters())) { | ||||||||||||
| return null; | ||||||||||||
| } | ||||||||||||
| rowCnt = Long.valueOf(tbl.getProperty(StatsSetupConst.ROW_COUNT)); | ||||||||||||
| Map<String, String> basicStats = MetaStoreUtils.isNonNativeTable(tbl.getTTable()) ? | ||||||||||||
| tbl.getStorageHandler().getBasicStatistics(tbl) : tbl.getParameters(); | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discuss: hive/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java Line 174 in 4f7200d
hive/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java Lines 225 to 227 in 4f7200d
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh sorry, I missed the #5215
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, i think
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @zhangbutao, could you please check the comments in #5215 and maybe incorporate changes from this PR into yours
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Iceberg table with equality deletes should not be optimized by the
#5215 I want to optimize the |
||||||||||||
|
|
||||||||||||
| rowCnt = Long.valueOf(basicStats.get(StatsSetupConst.ROW_COUNT)); | ||||||||||||
| } | ||||||||||||
| return rowCnt; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Query against Iceberg Branch/Tag can also benefit from the stats. We can optimize this later.