-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Avoid computation of output size if underlying stats are not confident #21280
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -160,6 +160,10 @@ public double getOutputSizeInBytes(PlanNode planNode) | |
| if (!sourceInfo.estimateSizeUsingVariables() && !isNaN(totalSize)) { | ||
| return totalSize; | ||
| } | ||
| if (!isConfident()) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Main change |
||
| // If we are not confident ( Non hbo stats + no row count info available) then we should not compute the output size | ||
| return NaN; | ||
| } | ||
|
|
||
| return getOutputSizeForVariables(planNode.getOutputVariables()); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,8 +36,23 @@ protected SimpleStatsRule(StatsNormalizer normalizer) | |
| @Override | ||
| public final Optional<PlanNodeStatsEstimate> calculate(T node, StatsProvider sourceStats, Lookup lookup, Session session, TypeProvider types) | ||
| { | ||
| return doCalculate(node, sourceStats, lookup, session, types) | ||
| Optional<PlanNodeStatsEstimate> planNodeStatsEstimate = doCalculate(node, sourceStats, lookup, session, types) | ||
| .map(estimate -> normalizer.normalize(estimate, node.getOutputVariables())); | ||
| if (node.getSources().isEmpty()) { | ||
| // dont do the confident check for tablescan stats | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Propagated up |
||
| return planNodeStatsEstimate; | ||
| } | ||
| boolean confident = sourceStats.getStats(node.getSources().get(0)).isConfident(); | ||
| for (PlanNode source : node.getSources()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Confidence level should not only depends on the source inputs, for example, EnforceSingleRowNode node should always be confident that the output is one single row. We need to exclude these rules from this check.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see maybe better to add this to an abstract method and override it in those rules |
||
| confident = sourceStats.getStats(source).isConfident(); | ||
| if (!confident) { | ||
| break; | ||
| } | ||
| } | ||
| boolean finalConfident = confident; | ||
| return planNodeStatsEstimate.map(p -> new PlanNodeStatsEstimate(p.getOutputRowCount(), | ||
| p.getTotalSize(), finalConfident, | ||
| p.getVariableStatistics(), p.getJoinNodeStatsEstimate(), p.getTableWriterNodeStatsEstimate())); | ||
| } | ||
|
|
||
| protected abstract Optional<PlanNodeStatsEstimate> doCalculate(T node, StatsProvider sourceStats, Lookup lookup, Session session, TypeProvider types); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,6 +59,10 @@ protected Optional<PlanNodeStatsEstimate> doCalculate(TableScanNode node, StatsP | |
| Constraint<ColumnHandle> constraint = new Constraint<>(node.getCurrentConstraint()); | ||
|
|
||
| TableStatistics tableStatistics = metadata.getTableStatistics(session, node.getTable(), ImmutableList.copyOf(node.getAssignments().values()), constraint); | ||
| if (tableStatistics.getRowCount().isUnknown()) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If no variable statistics, we could set the confidence to be false but not sure that would be effective |
||
| // Since we do not have any hms statistics, we should not be confident | ||
| return Optional.of(PlanNodeStatsEstimate.unknown()); | ||
| } | ||
| Map<VariableReferenceExpression, VariableStatsEstimate> outputVariableStats = new HashMap<>(); | ||
|
|
||
| for (Map.Entry<VariableReferenceExpression, ColumnHandle> entry : node.getAssignments().entrySet()) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -67,7 +67,7 @@ public class TestGraphvizPrinter | |
| TupleDomain.all(), | ||
| TupleDomain.all()); | ||
| private static final String TEST_TABLE_SCAN_NODE_INNER_OUTPUT = format( | ||
| "label=\"{TableScan | [TableHandle \\{connectorId='%s', connectorHandle='%s', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (0B), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "label=\"{TableScan | [TableHandle \\{connectorId='%s', connectorHandle='%s', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "}\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue", | ||
| TEST_CONNECTOR_ID, | ||
| TEST_CONNECTOR_TABLE_HANDLE); | ||
|
|
@@ -133,12 +133,12 @@ public void testPrintDistributedFromFragments() | |
| String expected = "digraph distributed_plan {\n" + | ||
| "subgraph cluster_0 {\n" + | ||
| "label = \"SOURCE\"\n" + | ||
| "plannode_1[label=\"{TableScan | [TableHandle \\{connectorId='connector_id', connectorHandle='com.facebook.presto.testing.TestingMetadata$TestingTableHandle@1af56f7', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (0B), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "plannode_1[label=\"{TableScan | [TableHandle \\{connectorId='connector_id', connectorHandle='com.facebook.presto.testing.TestingMetadata$TestingTableHandle@1af56f7', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "}\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue];\n" + | ||
| "}\n" + | ||
| "subgraph cluster_1 {\n" + | ||
| "label = \"SOURCE\"\n" + | ||
| "plannode_1[label=\"{TableScan | [TableHandle \\{connectorId='connector_id', connectorHandle='com.facebook.presto.testing.TestingMetadata$TestingTableHandle@1af56f7', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (0B), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "plannode_1[label=\"{TableScan | [TableHandle \\{connectorId='connector_id', connectorHandle='com.facebook.presto.testing.TestingMetadata$TestingTableHandle@1af56f7', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\}\n" + | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, the size is 0B because the number of output variables is 0, i.e. no output at all, and outputing 0B sounds better here. |
||
| "}\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue];\n" + | ||
| "}\n" + | ||
| "}\n"; | ||
|
|
@@ -175,11 +175,11 @@ public void testPrintLogicalForJoinNode() | |
| String expected = "digraph logical_plan {\n" + | ||
| "subgraph cluster_0 {\n" + | ||
| "label = \"SOURCE\"\n" + | ||
| "plannode_1[label=\"{CrossJoin[REPLICATED]|Estimates: \\{rows: ? (0B), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "plannode_1[label=\"{CrossJoin[REPLICATED]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "}\", style=\"rounded, filled\", shape=record, fillcolor=orange];\n" + | ||
| "plannode_2[label=\"{TableScan | [TableHandle \\{connectorId='connector_id', connectorHandle='com.facebook.presto.testing.TestingMetadata$TestingTableHandle@1af56f7', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (0B), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "plannode_2[label=\"{TableScan | [TableHandle \\{connectorId='connector_id', connectorHandle='com.facebook.presto.testing.TestingMetadata$TestingTableHandle@1af56f7', layout='Optional.empty'\\}]|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "}\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue];\n" + | ||
| "plannode_3[label=\"{Values|Estimates: \\{rows: ? (0B), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "plannode_3[label=\"{Values|Estimates: \\{rows: ? (?), cpu: ?, memory: ?, network: ?\\}\n" + | ||
| "}\", style=\"rounded, filled\", shape=record, fillcolor=deepskyblue];\n" + | ||
| "}\n" + | ||
| "plannode_1 -> plannode_3 [label = \"Build\"];\n" + //valuesNode should be the Build side | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In current logic, for case when input statistics for some table is unknown, it will match history which also has unknown statistics for the same table (and similar statistics for other table), otherwise will not match. This will make HBO to always return the latest run, even there are history with closer match, i.e. unknown for the same table and similar statistics for other tables.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see makes sense, i can remove this