From d22026f3af87f9f9a0907affe485577369d80589 Mon Sep 17 00:00:00 2001 From: feilong-liu Date: Fri, 3 May 2024 09:39:39 -0700 Subject: [PATCH 1/4] Revert "Add histograms for optimizer cost calculation" This reverts commit e8c465902a6b4025721d9fd4bf7936a155ba6a79. --- .../planner/AbstractCostBasedPlanTest.java | 62 +- .../sql/presto/tpcds/histogram/q85.plan.txt | 38 - .../hive/TestHiveIntegrationSmokeTest.java | 448 +++--- .../hive/TestParquetDistributedQueries.java | 24 +- .../facebook/presto/hive/TestShowStats.java | 116 +- .../IcebergDistributedSmokeTestBase.java | 20 +- .../presto/SystemSessionProperties.java | 14 +- .../cost/ComparisonStatsCalculator.java | 92 +- ...ConnectorFilterStatsCalculatorService.java | 9 +- .../cost/DisjointRangeDomainHistogram.java | 352 ----- .../presto/cost/FilterStatsCalculator.java | 21 +- .../presto/cost/HistogramCalculator.java | 167 --- .../facebook/presto/cost/JoinStatsRule.java | 3 - .../cost/PlanNodeStatsEstimateMath.java | 55 +- .../facebook/presto/cost/StatisticRange.java | 107 +- .../com/facebook/presto/cost/StatsUtil.java | 1 - .../cost/UniformDistributionHistogram.java | 148 -- .../presto/cost/VariableStatsEstimate.java | 39 +- .../presto/sql/analyzer/FeaturesConfig.java | 14 - .../presto/sql/rewrite/ShowStatsRewrite.java | 4 - ...ava => TestComparisonStatsCalculator.java} | 12 +- ...stComparisonStatsCalculatorHistograms.java | 23 - ...ComparisonStatsCalculatorNoHistograms.java | 23 - ...ConnectorFilterStatsCalculatorService.java | 6 +- .../TestDisjointRangeDomainHistogram.java | 288 ---- ...or.java => TestFilterStatsCalculator.java} | 58 +- .../TestFilterStatsCalculatorHistograms.java | 70 - ...TestFilterStatsCalculatorNoHistograms.java | 23 - .../facebook/presto/cost/TestHistogram.java | 88 -- .../presto/cost/TestHistogramCalculator.java | 100 -- .../cost/TestPlanNodeStatsEstimateMath.java | 59 - .../presto/cost/TestStatisticRange.java | 57 - .../presto/cost/TestUniformHistogram.java | 116 -- 
.../TestJsonPrestoQueryPlanFunctionUtils.java | 133 +- .../sql/analyzer/TestFeaturesConfig.java | 3 - .../sql/planner/assertions/BasePlanTest.java | 7 +- .../AbstractTestNativeGeneralQueries.java | 30 +- .../nativeworker/AbstractTestWriter.java | 96 +- .../tests/hive/TestExternalHiveTable.java | 20 +- .../tests/hive/TestHiveTableStatistics.java | 1298 ++++++++--------- .../spi/statistics/ColumnStatisticType.java | 3 +- .../spi/statistics/ColumnStatistics.java | 58 +- .../spi/statistics/ConnectorHistogram.java | 64 - .../presto/spi/statistics/Estimate.java | 71 - .../presto/tests/TestLocalQueries.java | 12 +- 45 files changed, 1155 insertions(+), 3297 deletions(-) delete mode 100644 presto-benchto-benchmarks/src/test/resources/sql/presto/tpcds/histogram/q85.plan.txt delete mode 100644 presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java delete mode 100644 presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java delete mode 100644 presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java rename presto-main/src/test/java/com/facebook/presto/cost/{AbstractTestComparisonStatsCalculator.java => TestComparisonStatsCalculator.java} (99%) delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorHistograms.java delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorNoHistograms.java delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java rename presto-main/src/test/java/com/facebook/presto/cost/{AbstractTestFilterStatsCalculator.java => TestFilterStatsCalculator.java} (97%) delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorHistograms.java delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorNoHistograms.java delete mode 100644 
presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java delete mode 100644 presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java diff --git a/presto-benchto-benchmarks/src/test/java/com/facebook/presto/sql/planner/AbstractCostBasedPlanTest.java b/presto-benchto-benchmarks/src/test/java/com/facebook/presto/sql/planner/AbstractCostBasedPlanTest.java index 39b0a85956935..46ed546dcd607 100644 --- a/presto-benchto-benchmarks/src/test/java/com/facebook/presto/sql/planner/AbstractCostBasedPlanTest.java +++ b/presto-benchto-benchmarks/src/test/java/com/facebook/presto/sql/planner/AbstractCostBasedPlanTest.java @@ -14,7 +14,6 @@ package com.facebook.presto.sql.planner; -import com.facebook.presto.Session; import com.facebook.presto.spi.ConnectorTableHandle; import com.facebook.presto.spi.plan.AggregationNode; import com.facebook.presto.spi.plan.JoinDistributionType; @@ -38,7 +37,6 @@ import java.nio.file.Paths; import java.util.stream.Stream; -import static com.facebook.presto.SystemSessionProperties.OPTIMIZER_USE_HISTOGRAMS; import static com.facebook.presto.spi.plan.JoinDistributionType.REPLICATED; import static com.facebook.presto.spi.plan.JoinType.INNER; import static com.facebook.presto.sql.Optimizer.PlanStage.OPTIMIZED_AND_VALIDATED; @@ -78,42 +76,11 @@ public void test(String queryResourcePath) assertEquals(generateQueryPlan(read(queryResourcePath)), read(getQueryPlanResourcePath(queryResourcePath))); } - @Test(dataProvider = "getQueriesDataProvider") - public void histogramsPlansMatch(String queryResourcePath) - { - String sql = read(queryResourcePath); - Session histogramSession = Session.builder(getQueryRunner().getDefaultSession()) - .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") - .build(); - Session noHistogramSession = 
Session.builder(getQueryRunner().getDefaultSession()) - .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "false") - .build(); - String regularPlan = generateQueryPlan(sql, noHistogramSession); - String histogramPlan = generateQueryPlan(sql, histogramSession); - if (!regularPlan.equals(histogramPlan)) { - assertEquals(histogramPlan, read(getHistogramPlanResourcePath(getQueryPlanResourcePath(queryResourcePath)))); - } - } - private String getQueryPlanResourcePath(String queryResourcePath) { return queryResourcePath.replaceAll("\\.sql$", ".plan.txt"); } - private String getHistogramPlanResourcePath(String regularPlanResourcePath) - { - Path root = Paths.get(regularPlanResourcePath); - return root.getParent().resolve("histogram/" + root.getFileName()).toString(); - } - - private Path getResourceWritePath(String queryResourcePath) - { - return Paths.get( - getSourcePath().toString(), - "src/test/resources", - getQueryPlanResourcePath(queryResourcePath)); - } - public void generate() throws Exception { @@ -123,24 +90,12 @@ public void generate() .parallel() .forEach(queryResourcePath -> { try { - Path queryPlanWritePath = getResourceWritePath(queryResourcePath); + Path queryPlanWritePath = Paths.get( + getSourcePath().toString(), + "src/test/resources", + getQueryPlanResourcePath(queryResourcePath)); createParentDirs(queryPlanWritePath.toFile()); - Session histogramSession = Session.builder(getQueryRunner().getDefaultSession()) - .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") - .build(); - Session noHistogramSession = Session.builder(getQueryRunner().getDefaultSession()) - .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "false") - .build(); - String sql = read(queryResourcePath); - String regularPlan = generateQueryPlan(sql, noHistogramSession); - String histogramPlan = generateQueryPlan(sql, histogramSession); - write(regularPlan.getBytes(UTF_8), queryPlanWritePath.toFile()); - // write out the histogram plan if it differs - if (!regularPlan.equals(histogramPlan)) { - 
Path histogramPlanWritePath = getResourceWritePath(getHistogramPlanResourcePath(queryResourcePath)); - createParentDirs(histogramPlanWritePath.toFile()); - write(histogramPlan.getBytes(UTF_8), histogramPlanWritePath.toFile()); - } + write(generateQueryPlan(read(queryResourcePath)).getBytes(UTF_8), queryPlanWritePath.toFile()); System.out.println("Generated expected plan for query: " + queryResourcePath); } catch (IOException e) { @@ -164,16 +119,11 @@ private static String read(String resource) } private String generateQueryPlan(String query) - { - return generateQueryPlan(query, getQueryRunner().getDefaultSession()); - } - - private String generateQueryPlan(String query, Session session) { String sql = query.replaceAll("\\s+;\\s+$", "") .replace("${database}.${schema}.", "") .replace("\"${database}\".\"${schema}\".\"${prefix}", "\""); - Plan plan = plan(session, sql, OPTIMIZED_AND_VALIDATED, false); + Plan plan = plan(sql, OPTIMIZED_AND_VALIDATED, false); JoinOrderPrinter joinOrderPrinter = new JoinOrderPrinter(); plan.getRoot().accept(joinOrderPrinter, 0); diff --git a/presto-benchto-benchmarks/src/test/resources/sql/presto/tpcds/histogram/q85.plan.txt b/presto-benchto-benchmarks/src/test/resources/sql/presto/tpcds/histogram/q85.plan.txt deleted file mode 100644 index 9da44b017a7a7..0000000000000 --- a/presto-benchto-benchmarks/src/test/resources/sql/presto/tpcds/histogram/q85.plan.txt +++ /dev/null @@ -1,38 +0,0 @@ -local exchange (GATHER, SINGLE, []) - remote exchange (GATHER, SINGLE, []) - final aggregation over (r_reason_desc) - local exchange (GATHER, SINGLE, []) - remote exchange (REPARTITION, HASH, [r_reason_desc]) - partial aggregation over (r_reason_desc) - join (INNER, REPLICATED): - join (INNER, REPLICATED): - join (INNER, PARTITIONED): - remote exchange (REPARTITION, HASH, [cd_demo_sk, cd_education_status, cd_marital_status]) - scan customer_demographics - local exchange (GATHER, SINGLE, []) - remote exchange (REPARTITION, HASH, 
[cd_education_status_3, cd_marital_status_2, wr_refunded_cdemo_sk]) - join (INNER, PARTITIONED): - remote exchange (REPARTITION, HASH, [wr_refunded_addr_sk]) - join (INNER, PARTITIONED): - remote exchange (REPARTITION, HASH, [ws_item_sk, ws_order_number]) - join (INNER, REPLICATED): - scan web_sales - local exchange (GATHER, SINGLE, []) - remote exchange (REPLICATE, BROADCAST, []) - scan date_dim - local exchange (GATHER, SINGLE, []) - remote exchange (REPARTITION, HASH, [wr_item_sk, wr_order_number]) - join (INNER, REPLICATED): - scan web_returns - local exchange (GATHER, SINGLE, []) - remote exchange (REPLICATE, BROADCAST, []) - scan customer_demographics - local exchange (GATHER, SINGLE, []) - remote exchange (REPARTITION, HASH, [ca_address_sk]) - scan customer_address - local exchange (GATHER, SINGLE, []) - remote exchange (REPLICATE, BROADCAST, []) - scan web_page - local exchange (GATHER, SINGLE, []) - remote exchange (REPLICATE, BROADCAST, []) - scan reason diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java index 2a978245afc28..b853262748fdb 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveIntegrationSmokeTest.java @@ -4467,39 +4467,39 @@ public void testCollectColumnStatisticsOnCreateTable() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null, null), " + - "('c_array', 
176.0E0, null, 0.5, null, null, null, null), " + - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('c_array', 176.0E0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null, null), " + - "('c_array', 96.0E0, null, 0.5, null, null, null, null), " + - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('c_array', 96.0E0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); // non existing partition assertQuery(format("SHOW STATS FOR 
(SELECT * FROM %s WHERE p_varchar = 'p3')", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 0E0, 0E0, null, null, null, null), " + - "('c_bigint', null, 0E0, 0E0, null, null, null, null), " + - "('c_double', null, 0E0, 0E0, null, null, null, null), " + - "('c_timestamp', null, 0E0, 0E0, null, null, null, null), " + - "('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "('c_varbinary', null, 0E0, 0E0, null, null, null, null), " + - "('c_array', null, 0E0, 0E0, null, null, null, null), " + - "('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "(null, null, null, null, 0E0, null, null, null)"); + "('c_boolean', null, 0E0, 0E0, null, null, null), " + + "('c_bigint', null, 0E0, 0E0, null, null, null), " + + "('c_double', null, 0E0, 0E0, null, null, null), " + + "('c_timestamp', null, 0E0, 0E0, null, null, null), " + + "('c_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "('c_varbinary', null, 0E0, 0E0, null, null, null), " + + "('c_array', null, 0E0, 0E0, null, null, null), " + + "('p_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "(null, null, null, null, 0E0, null, null)"); assertUpdate(format("DROP TABLE %s", tableName)); } @@ -4540,39 +4540,39 @@ public void testCollectColumnStatisticsOnInsert() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null, null), " + - "('c_array', 176.0E0, null, 0.5E0, null, null, null, null), " + - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, 
null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('c_array', 176.0E0, null, 0.5E0, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null, null), " + - "('c_array', 96.0E0, null, 0.5E0, null, null, null, null), " + - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('c_array', 96.0E0, null, 0.5E0, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); // non existing partition assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 0E0, 0E0, null, null, null, null), " + - "('c_bigint', null, 0E0, 0E0, null, null, null, null), " + - 
"('c_double', null, 0E0, 0E0, null, null, null, null), " + - "('c_timestamp', null, 0E0, 0E0, null, null, null, null), " + - "('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "('c_varbinary', null, 0E0, 0E0, null, null, null, null), " + - "('c_array', null, 0E0, 0E0, null, null, null, null), " + - "('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "(null, null, null, null, 0E0, null, null, null)"); + "('c_boolean', null, 0E0, 0E0, null, null, null), " + + "('c_bigint', null, 0E0, 0E0, null, null, null), " + + "('c_double', null, 0E0, 0E0, null, null, null), " + + "('c_timestamp', null, 0E0, 0E0, null, null, null), " + + "('c_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "('c_varbinary', null, 0E0, 0E0, null, null, null), " + + "('c_array', null, 0E0, 0E0, null, null, null), " + + "('p_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "(null, null, null, null, 0E0, null, null)"); assertUpdate(format("DROP TABLE %s", tableName)); } @@ -4646,109 +4646,109 @@ public void testAnalyzePartitionedTable() // No column stats before running analyze assertQuery("SHOW STATS FOR " + tableName, "SELECT * FROM VALUES " + - "('c_boolean', null, null, null, null, null, null, null), " + - "('c_bigint', null, null, null, null, null, null, null), " + - "('c_double', null, null, null, null, null, null, null), " + - "('c_timestamp', null, null, null, null, null, null, null), " + - "('c_varchar', null, null, null, null, null, null, null), " + - "('c_varbinary', null, null, null, null, null, null, null), " + - "('c_array', null, null, null, null, null, null, null), " + - "('p_varchar', 24.0, 3.0, 0.25, null, null, null, null), " + - "('p_bigint', null, 2.0, 0.25, null, '7', '8', null), " + - "(null, null, null, null, 16.0, null, null, null)"); + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, 
null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('c_array', null, null, null, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); // No column stats after running an empty analyze assertUpdate(format("ANALYZE %s WITH (partitions = ARRAY[])", tableName), 0); assertQuery("SHOW STATS FOR " + tableName, "SELECT * FROM VALUES " + - "('c_boolean', null, null, null, null, null, null, null), " + - "('c_bigint', null, null, null, null, null, null, null), " + - "('c_double', null, null, null, null, null, null, null), " + - "('c_timestamp', null, null, null, null, null, null, null), " + - "('c_varchar', null, null, null, null, null, null, null), " + - "('c_varbinary', null, null, null, null, null, null, null), " + - "('c_array', null, null, null, null, null, null, null), " + - "('p_varchar', 24.0, 3.0, 0.25, null, null, null, null), " + - "('p_bigint', null, 2.0, 0.25, null, '7', '8', null), " + - "(null, null, null, null, 16.0, null, null, null)"); + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('c_array', null, null, null, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); // Run analyze on 3 partitions including a null partition and a duplicate partition assertUpdate(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p1', '7'], ARRAY['p2', '7'], ARRAY['p2', '7'], ARRAY[NULL, NULL]])", tableName), 12); assertQuery(format("SHOW STATS 
FOR (SELECT * FROM %s WHERE p_varchar = 'p1' AND p_bigint = 7)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0, 0.5, null, null, null, null), " + - "('c_bigint', null, 2.0, 0.5, null, '0', '1', null), " + - "('c_double', null, 2.0, 0.5, null, '1.2', '2.2', null), " + - "('c_timestamp', null, 2.0, 0.5, null, null, null, null), " + - "('c_varchar', 8.0, 2.0, 0.5, null, null, null, null), " + - "('c_varbinary', 4.0, null, 0.5, null, null, null, null), " + - "('c_array', 176.0, null, 0.5, null, null, null, null), " + - "('p_varchar', 8.0, 1.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 1.0, 0.0, null, '7', '7', null), " + - "(null, null, null, null, 4.0, null, null, null)"); + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '0', '1'), " + + "('c_double', null, 2.0, 0.5, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('c_array', 176.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2' AND p_bigint = 7)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0, 0.5, null, null, null, null), " + - "('c_bigint', null, 2.0, 0.5, null, '1', '2', null), " + - "('c_double', null, 2.0, 0.5, null, '2.3', '3.3', null), " + - "('c_timestamp', null, 2.0, 0.5, null, null, null, null), " + - "('c_varchar', 8.0, 2.0, 0.5, null, null, null, null), " + - "('c_varbinary', 4.0, null, 0.5, null, null, null, null), " + - "('c_array', 96.0, null, 0.5, null, null, null, null), " + - "('p_varchar', 8.0, 1.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 1.0, 0.0, null, '7', '7', null), " + - "(null, null, null, null, 4.0, null, null, 
null)"); + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '1', '2'), " + + "('c_double', null, 2.0, 0.5, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('c_array', 96.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar IS NULL AND p_bigint IS NULL)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 1.0, 0.0, null, null, null, null), " + - "('c_bigint', null, 4.0, 0.0, null, '4', '7', null), " + - "('c_double', null, 4.0, 0.0, null, '4.7', '7.7', null), " + - "('c_timestamp', null, 4.0, 0.0, null, null, null, null), " + - "('c_varchar', 16.0, 4.0, 0.0, null, null, null, null), " + - "('c_varbinary', 8.0, null, 0.0, null, null, null, null), " + - "('c_array', 192.0, null, 0.0, null, null, null, null), " + - "('p_varchar', 0.0, 0.0, 1.0, null, null, null, null), " + - "('p_bigint', null, 0.0, 1.0, null, null, null, null), " + - "(null, null, null, null, 4.0, null, null, null)"); + "('c_boolean', null, 1.0, 0.0, null, null, null), " + + "('c_bigint', null, 4.0, 0.0, null, '4', '7'), " + + "('c_double', null, 4.0, 0.0, null, '4.7', '7.7'), " + + "('c_timestamp', null, 4.0, 0.0, null, null, null), " + + "('c_varchar', 16.0, 4.0, 0.0, null, null, null), " + + "('c_varbinary', 8.0, null, 0.0, null, null, null), " + + "('c_array', 192.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 1.0, null, null, null), " + + "('p_bigint', null, 0.0, 1.0, null, null, null), " + + "(null, null, null, null, 4.0, null, null)"); // Partition [p3, 8], [e1, 9], [e2, 9] have no column stats assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3' AND 
p_bigint = 8)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, null, null, null, null, null, null), " + - "('c_bigint', null, null, null, null, null, null, null), " + - "('c_double', null, null, null, null, null, null, null), " + - "('c_timestamp', null, null, null, null, null, null, null), " + - "('c_varchar', null, null, null, null, null, null, null), " + - "('c_varbinary', null, null, null, null, null, null, null), " + - "('c_array', null, null, null, null, null, null, null), " + - "('p_varchar', 8.0, 1.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 1.0, 0.0, null, '8', '8', null), " + - "(null, null, null, null, 4.0, null, null, null)"); + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('c_array', null, null, null, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '8', '8'), " + + "(null, null, null, null, 4.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e1' AND p_bigint = 9)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, null, null, null, null, null, null), " + - "('c_bigint', null, null, null, null, null, null, null), " + - "('c_double', null, null, null, null, null, null, null), " + - "('c_timestamp', null, null, null, null, null, null, null), " + - "('c_varchar', null, null, null, null, null, null, null), " + - "('c_varbinary', null, null, null, null, null, null, null), " + - "('c_array', null, null, null, null, null, null, null), " + - "('p_varchar', 0.0, 0.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 0.0, 0.0, null, null, null, null), " + - "(null, null, null, null, 0.0, null, null, null)"); + 
"('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('c_array', null, null, null, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e2' AND p_bigint = 9)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, null, null, null, null, null, null), " + - "('c_bigint', null, null, null, null, null, null, null), " + - "('c_double', null, null, null, null, null, null, null), " + - "('c_timestamp', null, null, null, null, null, null, null), " + - "('c_varchar', null, null, null, null, null, null, null), " + - "('c_varbinary', null, null, null, null, null, null, null), " + - "('c_array', null, null, null, null, null, null, null), " + - "('p_varchar', 0.0, 0.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 0.0, 0.0, null, null, null, null), " + - "(null, null, null, null, 0.0, null, null, null)"); + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('c_array', null, null, null, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); // Run analyze on the whole table assertUpdate("ANALYZE " + tableName, 16); @@ -4756,76 +4756,76 @@ public void 
testAnalyzePartitionedTable() // All partitions except empty partitions have column stats assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1' AND p_bigint = 7)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0, 0.5, null, null, null, null), " + - "('c_bigint', null, 2.0, 0.5, null, '0', '1', null), " + - "('c_double', null, 2.0, 0.5, null, '1.2', '2.2', null), " + - "('c_timestamp', null, 2.0, 0.5, null, null, null, null), " + - "('c_varchar', 8.0, 2.0, 0.5, null, null, null, null), " + - "('c_varbinary', 4.0, null, 0.5, null, null, null, null), " + - "('c_array', 176.0, null, 0.5, null, null, null, null), " + - "('p_varchar', 8.0, 1.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 1.0, 0.0, null, '7', '7', null), " + - "(null, null, null, null, 4.0, null, null, null)"); + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '0', '1'), " + + "('c_double', null, 2.0, 0.5, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('c_array', 176.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2' AND p_bigint = 7)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0, 0.5, null, null, null, null), " + - "('c_bigint', null, 2.0, 0.5, null, '1', '2', null), " + - "('c_double', null, 2.0, 0.5, null, '2.3', '3.3', null), " + - "('c_timestamp', null, 2.0, 0.5, null, null, null, null), " + - "('c_varchar', 8.0, 2.0, 0.5, null, null, null, null), " + - "('c_varbinary', 4.0, null, 0.5, null, null, null, null), " + - "('c_array', 96.0, null, 0.5, null, null, null, null), " + - "('p_varchar', 8.0, 1.0, 0.0, null, null, null, 
null), " + - "('p_bigint', null, 1.0, 0.0, null, '7', '7', null), " + - "(null, null, null, null, 4.0, null, null, null)"); + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '1', '2'), " + + "('c_double', null, 2.0, 0.5, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('c_array', 96.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar IS NULL AND p_bigint IS NULL)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 1.0, 0.0, null, null, null, null), " + - "('c_bigint', null, 4.0, 0.0, null, '4', '7', null), " + - "('c_double', null, 4.0, 0.0, null, '4.7', '7.7', null), " + - "('c_timestamp', null, 4.0, 0.0, null, null, null, null), " + - "('c_varchar', 16.0, 4.0, 0.0, null, null, null, null), " + - "('c_varbinary', 8.0, null, 0.0, null, null, null, null), " + - "('c_array', 192.0, null, 0.0, null, null, null, null), " + - "('p_varchar', 0.0, 0.0, 1.0, null, null, null, null), " + - "('p_bigint', null, 0.0, 1.0, null, null, null, null), " + - "(null, null, null, null, 4.0, null, null, null)"); + "('c_boolean', null, 1.0, 0.0, null, null, null), " + + "('c_bigint', null, 4.0, 0.0, null, '4', '7'), " + + "('c_double', null, 4.0, 0.0, null, '4.7', '7.7'), " + + "('c_timestamp', null, 4.0, 0.0, null, null, null), " + + "('c_varchar', 16.0, 4.0, 0.0, null, null, null), " + + "('c_varbinary', 8.0, null, 0.0, null, null, null), " + + "('c_array', 192.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 1.0, null, null, null), " + + "('p_bigint', null, 0.0, 1.0, null, null, null), " + + "(null, null, null, null, 4.0, null, null)"); assertQuery(format("SHOW 
STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3' AND p_bigint = 8)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0, 0.5, null, null, null, null), " + - "('c_bigint', null, 2.0, 0.5, null, '2', '3', null), " + - "('c_double', null, 2.0, 0.5, null, '3.4', '4.4', null), " + - "('c_timestamp', null, 2.0, 0.5, null, null, null, null), " + - "('c_varchar', 8.0, 2.0, 0.5, null, null, null, null), " + - "('c_varbinary', 4.0, null, 0.5, null, null, null, null), " + - "('c_array', 96.0, null, 0.5, null, null, null, null), " + - "('p_varchar', 8.0, 1.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 1.0, 0.0, null, '8', '8', null), " + - "(null, null, null, null, 4.0, null, null, null)"); + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '2', '3'), " + + "('c_double', null, 2.0, 0.5, null, '3.4', '4.4'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('c_array', 96.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '8', '8'), " + + "(null, null, null, null, 4.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e1' AND p_bigint = 9)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 0.0, 0.0, null, null, null, null), " + - "('c_bigint', null, 0.0, 0.0, null, null, null, null), " + - "('c_double', null, 0.0, 0.0, null, null, null, null), " + - "('c_timestamp', null, 0.0, 0.0, null, null, null, null), " + - "('c_varchar', 0.0, 0.0, 0.0, null, null, null, null), " + - "('c_varbinary', 0.0, null, 0.0, null, null, null, null), " + - "('c_array', 0.0, null, 0.0, null, null, null, null), " + - "('p_varchar', 0.0, 0.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 0.0, 0.0, null, null, null, null), " + - "(null, null, null, null, 0.0, null, null, 
null)"); + "('c_boolean', null, 0.0, 0.0, null, null, null), " + + "('c_bigint', null, 0.0, 0.0, null, null, null), " + + "('c_double', null, 0.0, 0.0, null, null, null), " + + "('c_timestamp', null, 0.0, 0.0, null, null, null), " + + "('c_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('c_varbinary', 0.0, null, 0.0, null, null, null), " + + "('c_array', 0.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e2' AND p_bigint = 9)", tableName), "SELECT * FROM VALUES " + - "('c_boolean', null, 0.0, 0.0, null, null, null, null), " + - "('c_bigint', null, 0.0, 0.0, null, null, null, null), " + - "('c_double', null, 0.0, 0.0, null, null, null, null), " + - "('c_timestamp', null, 0.0, 0.0, null, null, null, null), " + - "('c_varchar', 0.0, 0.0, 0.0, null, null, null, null), " + - "('c_varbinary', 0.0, null, 0.0, null, null, null, null), " + - "('c_array', 0.0, null, 0.0, null, null, null, null), " + - "('p_varchar', 0.0, 0.0, 0.0, null, null, null, null), " + - "('p_bigint', null, 0.0, 0.0, null, null, null, null), " + - "(null, null, null, null, 0.0, null, null, null)"); + "('c_boolean', null, 0.0, 0.0, null, null, null), " + + "('c_bigint', null, 0.0, 0.0, null, null, null), " + + "('c_double', null, 0.0, 0.0, null, null, null), " + + "('c_timestamp', null, 0.0, 0.0, null, null, null), " + + "('c_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('c_varbinary', 0.0, null, 0.0, null, null, null), " + + "('c_array', 0.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); // Drop the partitioned test table assertUpdate(format("DROP TABLE %s", tableName)); @@ -4840,32 +4840,32 @@ public void 
testAnalyzeUnpartitionedTable() // No column stats before running analyze assertQuery("SHOW STATS FOR " + tableName, "SELECT * FROM VALUES " + - "('c_boolean', null, null, null, null, null, null, null), " + - "('c_bigint', null, null, null, null, null, null, null), " + - "('c_double', null, null, null, null, null, null, null), " + - "('c_timestamp', null, null, null, null, null, null, null), " + - "('c_varchar', null, null, null, null, null, null, null), " + - "('c_varbinary', null, null, null, null, null, null, null), " + - "('c_array', null, null, null, null, null, null, null), " + - "('p_varchar', null, null, null, null, null, null, null), " + - "('p_bigint', null, null, null, null, null, null, null), " + - "(null, null, null, null, 16.0, null, null, null)"); + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('c_array', null, null, null, null, null, null), " + + "('p_varchar', null, null, null, null, null, null), " + + "('p_bigint', null, null, null, null, null, null), " + + "(null, null, null, null, 16.0, null, null)"); // Run analyze on the whole table assertUpdate("ANALYZE " + tableName, 16); assertQuery("SHOW STATS FOR " + tableName, "SELECT * FROM VALUES " + - "('c_boolean', null, 2.0, 0.375, null, null, null, null), " + - "('c_bigint', null, 8.0, 0.375, null, '0', '7', null), " + - "('c_double', null, 10.0, 0.375, null, '1.2', '7.7', null), " + - "('c_timestamp', null, 10.0, 0.375, null, null, null, null), " + - "('c_varchar', 40.0, 10.0, 0.375, null, null, null, null), " + - "('c_varbinary', 20.0, null, 0.375, null, null, null, null), " + - "('c_array', 560.0, null, 0.375, null, null, null, null), " + - "('p_varchar', 24.0, 3.0, 0.25, null, null, null, 
null), " + - "('p_bigint', null, 2.0, 0.25, null, '7', '8', null), " + - "(null, null, null, null, 16.0, null, null, null)"); + "('c_boolean', null, 2.0, 0.375, null, null, null), " + + "('c_bigint', null, 8.0, 0.375, null, '0', '7'), " + + "('c_double', null, 10.0, 0.375, null, '1.2', '7.7'), " + + "('c_timestamp', null, 10.0, 0.375, null, null, null), " + + "('c_varchar', 40.0, 10.0, 0.375, null, null, null), " + + "('c_varbinary', 20.0, null, 0.375, null, null, null), " + + "('c_array', 560.0, null, 0.375, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); // Drop the unpartitioned test table assertUpdate(format("DROP TABLE %s", tableName)); @@ -4950,11 +4950,11 @@ public void testInsertMultipleColumnsFromSameChannel() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar_1 = '2' AND p_varchar_2 = '2')", tableName), "SELECT * FROM VALUES " + - "('c_bigint_1', null, 1.0E0, 0.0E0, null, '1', '1', null), " + - "('c_bigint_2', null, 1.0E0, 0.0E0, null, '1', '1', null), " + - "('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 1.0E0, null, null, null)"); + "('c_bigint_1', null, 1.0E0, 0.0E0, null, '1', '1'), " + + "('c_bigint_2', null, 1.0E0, 0.0E0, null, '1', '1'), " + + "('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 1.0E0, null, null)"); assertUpdate(format("" + "INSERT INTO %s (c_bigint_1, c_bigint_2, p_varchar_1, p_varchar_2) " + @@ -4964,11 +4964,11 @@ public void testInsertMultipleColumnsFromSameChannel() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar_1 = 'O' AND p_varchar_2 = 'O')", tableName), "SELECT * FROM VALUES " + - "('c_bigint_1', null, 1.0E0, 0.0E0, null, '15008', '15008', 
null), " + - "('c_bigint_2', null, 1.0E0, 0.0E0, null, '15008', '15008', null), " + - "('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 1.0E0, null, null, null)"); + "('c_bigint_1', null, 1.0E0, 0.0E0, null, '15008', '15008'), " + + "('c_bigint_2', null, 1.0E0, 0.0E0, null, '15008', '15008'), " + + "('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 1.0E0, null, null)"); assertUpdate(format("DROP TABLE %s", tableName)); } diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java index d8bb271cebcd0..e2e3cab39fa31 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java @@ -73,22 +73,22 @@ public void testQuickStats() // Since no stats were collected during write, all column stats will be null assertQuery("SHOW STATS FOR test_quick_stats", "SELECT * FROM (VALUES " + - " ('orderkey', null, null, null, null, null, null, null), " + - " ('linenumber', null, null, null, null, null, null, null), " + - " ('shipdate', null, null, null, null, null, null, null), " + - " ('arr', null, null, null, null, null, null, null), " + - " ('rrow', null, null, null, null, null, null, null), " + - " (null, null, null, null, 60175.0, null, null, null))"); + " ('orderkey', null, null, null, null, null, null), " + + " ('linenumber', null, null, null, null, null, null), " + + " ('shipdate', null, null, null, null, null, null), " + + " ('arr', null, null, null, null, null, null), " + + " ('rrow', null, null, null, null, null, null), " + + " (null, null, null, null, 60175.0, null, null))"); // With quick stats enabled, we 
should get nulls_fraction, low_value and high_value for the non-nested columns assertQuery(session, "SHOW STATS FOR test_quick_stats", "SELECT * FROM (VALUES " + - " ('orderkey', null, null, 0.0, null, '1', '60000', null), " + - " ('linenumber', null, null, 0.0, null, '1', '7', null), " + - " ('shipdate', null, null, 0.0, null, '1992-01-04', '1998-11-29', null), " + - " ('arr', null, null, null, null, null, null, null), " + - " ('rrow', null, null, null, null, null, null, null), " + - " (null, null, null, null, 60175.0, null, null, null))"); + " ('orderkey', null, null, 0.0, null, '1', '60000'), " + + " ('linenumber', null, null, 0.0, null, '1', '7'), " + + " ('shipdate', null, null, 0.0, null, '1992-01-04', '1998-11-29'), " + + " ('arr', null, null, null, null, null, null), " + + " ('rrow', null, null, null, null, null, null), " + + " (null, null, null, null, 60175.0, null, null))"); } finally { getQueryRunner().execute("DROP TABLE test_quick_stats"); diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestShowStats.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestShowStats.java index 2969c5a12f7f0..07274f6ea61c8 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestShowStats.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestShowStats.java @@ -44,97 +44,97 @@ public void testShowStats() { assertQuery("SHOW STATS FOR nation_partitioned", "SELECT * FROM (VALUES " + - " ('regionkey', null, 5.0, 0.0, null, 0, 4, null), " + - " ('nationkey', null, 5.0, 0.0, null, 0, 24, null), " + - " ('name', 177.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 1857.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 25.0, null, null, null))"); + " ('regionkey', null, 5.0, 0.0, null, 0, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 24), " + + " ('name', 177.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1857.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 25.0, null, null))"); 
assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 5.0, 0.0, null, 0, 4, null), " + - " ('nationkey', null, 5.0, 0.0, null, 0, 24, null), " + - " ('name', 177.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 1857.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 25.0, null, null, null))"); + " ('regionkey', null, 5.0, 0.0, null, 0, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 24), " + + " ('name', 177.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1857.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 25.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT regionkey, name FROM nation_partitioned)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 5.0, 0.0, null, 0, 4, null), " + - " ('name', 177.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 25.0, null, null, null))"); + " ('regionkey', null, 5.0, 0.0, null, 0, 4), " + + " ('name', 177.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 25.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey IS NOT NULL)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 5.0, 0.0, null, 0, 4, null), " + - " ('nationkey', null, 5.0, 0.0, null, 0, 24, null), " + - " ('name', 177.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 1857.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 25.0, null, null, null))"); + " ('regionkey', null, 5.0, 0.0, null, 0, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 24), " + + " ('name', 177.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1857.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 25.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey IS NULL)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 0.0, 0.0, null, null, null, null), " + - " ('nationkey', null, 0.0, 0.0, null, null, null, null), 
" + - " ('name', 0.0, 0.0, 0.0, null, null, null, null), " + - " ('comment', 0.0, 0.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 0.0, null, null, null))"); + " ('regionkey', null, 0.0, 0.0, null, null, null), " + + " ('nationkey', null, 0.0, 0.0, null, null, null), " + + " ('name', 0.0, 0.0, 0.0, null, null, null), " + + " ('comment', 0.0, 0.0, 0.0, null, null, null), " + + " (null, null, null, null, 0.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey = 1)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 1.0, 0.0, null, 1, 1, null), " + - " ('nationkey', null, 5.0, 0.0, null, 1, 24, null), " + - " ('name', 38.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 500.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 5.0, null, null, null))"); + " ('regionkey', null, 1.0, 0.0, null, 1, 1), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 38.0, 5.0, 0.0, null, null, null), " + + " ('comment', 500.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 5.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey IN (1, 3))", "SELECT * FROM (VALUES " + - " ('regionkey', null, 2.0, 0.0, null, 1, 3, null), " + - " ('nationkey', null, 5.0, 0.0, null, 1, 24, null), " + - " ('name', 78.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 847.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 10.0, null, null, null))"); + " ('regionkey', null, 2.0, 0.0, null, 1, 3), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 78.0, 5.0, 0.0, null, null, null), " + + " ('comment', 847.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 10.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey BETWEEN 1 AND 1 + 2)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 3.0, 0.0, null, 1, 3, null), " + - " ('nationkey', null, 5.0, 
0.0, null, 1, 24, null), " + - " ('name', 109.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 1199.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 15.0, null, null, null))"); + " ('regionkey', null, 3.0, 0.0, null, 1, 3), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 109.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1199.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 15.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey > 3)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 1.0, 0.0, null, 4, 4, null), " + - " ('nationkey', null, 5.0, 0.0, null, 4, 20, null), " + - " ('name', 31.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 348.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 5.0, null, null, null))"); + " ('regionkey', null, 1.0, 0.0, null, 4, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 4, 20), " + + " ('name', 31.0, 5.0, 0.0, null, null, null), " + + " ('comment', 348.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 5.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey < 1)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 1.0, 0.0, null, 0, 0, null), " + - " ('nationkey', null, 5.0, 0.0, null, 0, 16, null), " + - " ('name', 37.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 310.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 5.0, null, null, null))"); + " ('regionkey', null, 1.0, 0.0, null, 0, 0), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 16), " + + " ('name', 37.0, 5.0, 0.0, null, null, null), " + + " ('comment', 310.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 5.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey > 0 and regionkey < 4)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 3.0, 0.0, null, 1, 3, null), " + - " 
('nationkey', null, 5.0, 0.0, null, 1, 24, null), " + - " ('name', 109.0, 5.0, 0.0, null, null, null, null), " + - " ('comment', 1199.0, 5.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 15.0, null, null, null))"); + " ('regionkey', null, 3.0, 0.0, null, 1, 3), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 109.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1199.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 15.0, null, null))"); assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey > 10 or regionkey < 0)", "SELECT * FROM (VALUES " + - " ('regionkey', null, 0.0, 0.0, null, null, null, null), " + - " ('nationkey', null, 0.0, 0.0, null, null, null, null), " + - " ('name', 0.0, 0.0, 0.0, null, null, null, null), " + - " ('comment', 0.0, 0.0, 0.0, null, null, null, null), " + - " (null, null, null, null, 0.0, null, null, null))"); + " ('regionkey', null, 0.0, 0.0, null, null, null), " + + " ('nationkey', null, 0.0, 0.0, null, null, null), " + + " ('name', 0.0, 0.0, 0.0, null, null, null), " + + " ('comment', 0.0, 0.0, 0.0, null, null, null), " + + " (null, null, null, null, 0.0, null, null))"); } @Test diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedSmokeTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedSmokeTestBase.java index db2a1582aaf8f..9e8a756ae89d8 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedSmokeTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedSmokeTestBase.java @@ -995,21 +995,21 @@ public void testBasicTableStatistics() assertQuery(session, "SHOW STATS FOR " + tableName, "VALUES " + - " ('col', null, null, null, NULL, NULL, NULL, NULL), " + - " (NULL, NULL, NULL, NULL, 0e0, NULL, NULL, NULL)"); + " ('col', null, null, null, NULL, NULL, NULL), " + + " (NULL, NULL, NULL, NULL, 0e0, NULL, NULL)"); 
assertUpdate("INSERT INTO " + tableName + " VALUES -10", 1); assertUpdate("INSERT INTO " + tableName + " VALUES 100", 1); assertQuery(session, "SHOW STATS FOR " + tableName, "VALUES " + - " ('col', NULL, NULL, 0.0, NULL, '-10.0', '100.0', NULL), " + - " (NULL, NULL, NULL, NULL, 2e0, NULL, NULL, NULL)"); + " ('col', NULL, NULL, 0.0, NULL, '-10.0', '100.0'), " + + " (NULL, NULL, NULL, NULL, 2e0, NULL, NULL)"); assertUpdate("INSERT INTO " + tableName + " VALUES 200", 1); assertQuery(session, "SHOW STATS FOR " + tableName, "VALUES " + - " ('col', NULL, NULL, 0.0, NULL, '-10.0', '200.0', NULL), " + - " (NULL, NULL, NULL, NULL, 3e0, NULL, NULL, NULL)"); + " ('col', NULL, NULL, 0.0, NULL, '-10.0', '200.0'), " + + " (NULL, NULL, NULL, NULL, 3e0, NULL, NULL)"); dropTable(session, tableName); } @@ -1157,16 +1157,16 @@ public void testTableStatisticsTimestamp() assertQuery(session, "SHOW STATS FOR " + tableName, "VALUES " + - " ('col', null, null, null, NULL, NULL, NULL, NULL), " + - " (NULL, NULL, NULL, NULL, 0e0, NULL, NULL, NULL)"); + " ('col', null, null, null, NULL, NULL, NULL), " + + " (NULL, NULL, NULL, NULL, 0e0, NULL, NULL)"); assertUpdate(session, "INSERT INTO " + tableName + " VALUES TIMESTAMP '2021-01-02 09:04:05.321'", 1); assertUpdate(session, "INSERT INTO " + tableName + " VALUES TIMESTAMP '2022-12-22 10:07:08.456'", 1); assertQuery(session, "SHOW STATS FOR " + tableName, "VALUES " + - " ('col', NULL, NULL, 0.0, NULL, '2021-01-02 09:04:05.321', '2022-12-22 10:07:08.456', NULL), " + - " (NULL, NULL, NULL, NULL, 2e0, NULL, NULL, NULL)"); + " ('col', NULL, NULL, 0.0, NULL, '2021-01-02 09:04:05.321', '2022-12-22 10:07:08.456'), " + + " (NULL, NULL, NULL, NULL, 2e0, NULL, NULL)"); dropTable(session, tableName); } diff --git a/presto-main/src/main/java/com/facebook/presto/SystemSessionProperties.java b/presto-main/src/main/java/com/facebook/presto/SystemSessionProperties.java index 0a26d1dedfe72..01d3432042e4e 100644 --- 
a/presto-main/src/main/java/com/facebook/presto/SystemSessionProperties.java +++ b/presto-main/src/main/java/com/facebook/presto/SystemSessionProperties.java @@ -345,7 +345,6 @@ public final class SystemSessionProperties public static final String NATIVE_EXECUTION_PROCESS_REUSE_ENABLED = "native_execution_process_reuse_enabled"; public static final String NATIVE_DEBUG_VALIDATE_OUTPUT_FROM_OPERATORS = "native_debug_validate_output_from_operators"; public static final String DEFAULT_VIEW_SECURITY_MODE = "default_view_security_mode"; - public static final String OPTIMIZER_USE_HISTOGRAMS = "optimizer_use_histograms"; private final List> sessionProperties; @@ -1909,7 +1908,7 @@ public SystemSessionProperties( GENERATE_DOMAIN_FILTERS, "Infer predicates from column domains during predicate pushdown", featuresConfig.getGenerateDomainFilters(), - false), + false), booleanProperty( REWRITE_EXPRESSION_WITH_CONSTANT_EXPRESSION, "Rewrite left join with is null check to semi join", @@ -1926,11 +1925,7 @@ public SystemSessionProperties( featuresConfig.getDefaultViewSecurityMode(), false, value -> CreateView.Security.valueOf(((String) value).toUpperCase()), - CreateView.Security::name), - booleanProperty(OPTIMIZER_USE_HISTOGRAMS, - "whether or not to use histograms in the CBO", - featuresConfig.isUseHistograms(), - false)); + CreateView.Security::name)); } public static boolean isSpoolingOutputBufferEnabled(Session session) @@ -3212,9 +3207,4 @@ public static CreateView.Security getDefaultViewSecurityMode(Session session) { return session.getSystemProperty(DEFAULT_VIEW_SECURITY_MODE, CreateView.Security.class); } - - public static boolean shouldOptimizerUseHistograms(Session session) - { - return session.getSystemProperty(OPTIMIZER_USE_HISTOGRAMS, Boolean.class); - } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java index 8931cec0cd115..f695c5abb8dce 
100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java @@ -13,17 +13,12 @@ */ package com.facebook.presto.cost; -import com.facebook.airlift.log.Logger; -import com.facebook.presto.Session; -import com.facebook.presto.SystemSessionProperties; import com.facebook.presto.spi.relation.VariableReferenceExpression; -import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.sql.tree.ComparisonExpression; import java.util.Optional; import java.util.OptionalDouble; -import static com.facebook.presto.cost.FilterStatsCalculator.UNKNOWN_FILTER_COEFFICIENT; import static com.facebook.presto.cost.VariableStatsEstimate.buildFrom; import static com.facebook.presto.util.MoreMath.firstNonNaN; import static com.facebook.presto.util.MoreMath.max; @@ -36,8 +31,6 @@ public final class ComparisonStatsCalculator { - private static final Logger log = Logger.get(ComparisonStatsCalculator.class); - private ComparisonStatsCalculator() {} public static PlanNodeStatsEstimate estimateExpressionToLiteralComparison( @@ -45,22 +38,19 @@ public static PlanNodeStatsEstimate estimateExpressionToLiteralComparison( VariableStatsEstimate expressionStatistics, Optional expressionVariable, OptionalDouble literalValue, - ComparisonExpression.Operator operator, - Optional session) + ComparisonExpression.Operator operator) { switch (operator) { case EQUAL: - return estimateExpressionEqualToLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue, session); + return estimateExpressionEqualToLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue); case NOT_EQUAL: - return estimateExpressionNotEqualToLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue, session); + return estimateExpressionNotEqualToLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue); case 
LESS_THAN: - return estimateExpressionLessThanLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue, false, session); case LESS_THAN_OR_EQUAL: - return estimateExpressionLessThanLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue, true, session); + return estimateExpressionLessThanLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue); case GREATER_THAN: - return estimateExpressionGreaterThanLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue, false, session); case GREATER_THAN_OR_EQUAL: - return estimateExpressionGreaterThanLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue, true, session); + return estimateExpressionGreaterThanLiteral(inputStatistics, expressionStatistics, expressionVariable, literalValue); case IS_DISTINCT_FROM: return PlanNodeStatsEstimate.unknown(); default: @@ -72,34 +62,35 @@ private static PlanNodeStatsEstimate estimateExpressionEqualToLiteral( PlanNodeStatsEstimate inputStatistics, VariableStatsEstimate expressionStatistics, Optional expressionVariable, - OptionalDouble literalValue, - Optional session) + OptionalDouble literalValue) { StatisticRange filterRange; if (literalValue.isPresent()) { - filterRange = new StatisticRange(literalValue.getAsDouble(), false, literalValue.getAsDouble(), false, 1); + filterRange = new StatisticRange(literalValue.getAsDouble(), literalValue.getAsDouble(), 1); } else { filterRange = new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, 1); } - return estimateFilterRange(inputStatistics, expressionStatistics, expressionVariable, filterRange, session); + return estimateFilterRange(inputStatistics, expressionStatistics, expressionVariable, filterRange); } private static PlanNodeStatsEstimate estimateExpressionNotEqualToLiteral( PlanNodeStatsEstimate inputStatistics, VariableStatsEstimate expressionStatistics, Optional expressionVariable, - OptionalDouble literalValue, - 
Optional session) + OptionalDouble literalValue) { + StatisticRange expressionRange = StatisticRange.from(expressionStatistics); + StatisticRange filterRange; if (literalValue.isPresent()) { - filterRange = new StatisticRange(literalValue.getAsDouble(), false, literalValue.getAsDouble(), false, 1); + filterRange = new StatisticRange(literalValue.getAsDouble(), literalValue.getAsDouble(), 1); } else { - filterRange = new StatisticRange(NEGATIVE_INFINITY, true, POSITIVE_INFINITY, true, 1); + filterRange = new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, 1); } - double filterFactor = 1 - calculateFilterFactor(expressionStatistics, filterRange, session); + StatisticRange intersectRange = expressionRange.intersect(filterRange); + double filterFactor = 1 - expressionRange.overlapPercentWith(intersectRange); PlanNodeStatsEstimate.Builder estimate = PlanNodeStatsEstimate.buildFrom(inputStatistics); estimate.setOutputRowCount(filterFactor * (1 - expressionStatistics.getNullsFraction()) * inputStatistics.getOutputRowCount()); @@ -117,37 +108,33 @@ private static PlanNodeStatsEstimate estimateExpressionLessThanLiteral( PlanNodeStatsEstimate inputStatistics, VariableStatsEstimate expressionStatistics, Optional expressionVariable, - OptionalDouble literalValue, - boolean equals, - Optional session) + OptionalDouble literalValue) { - StatisticRange filterRange = new StatisticRange(NEGATIVE_INFINITY, true, literalValue.orElse(POSITIVE_INFINITY), !equals, NaN); - return estimateFilterRange(inputStatistics, expressionStatistics, expressionVariable, filterRange, session); + StatisticRange filterRange = new StatisticRange(NEGATIVE_INFINITY, literalValue.orElse(POSITIVE_INFINITY), NaN); + return estimateFilterRange(inputStatistics, expressionStatistics, expressionVariable, filterRange); } private static PlanNodeStatsEstimate estimateExpressionGreaterThanLiteral( PlanNodeStatsEstimate inputStatistics, VariableStatsEstimate expressionStatistics, Optional expressionVariable, - 
OptionalDouble literalValue, - boolean equals, - Optional session) + OptionalDouble literalValue) { - StatisticRange filterRange = new StatisticRange(literalValue.orElse(NEGATIVE_INFINITY), !equals, POSITIVE_INFINITY, true, NaN); - return estimateFilterRange(inputStatistics, expressionStatistics, expressionVariable, filterRange, session); + StatisticRange filterRange = new StatisticRange(literalValue.orElse(NEGATIVE_INFINITY), POSITIVE_INFINITY, NaN); + return estimateFilterRange(inputStatistics, expressionStatistics, expressionVariable, filterRange); } private static PlanNodeStatsEstimate estimateFilterRange( PlanNodeStatsEstimate inputStatistics, VariableStatsEstimate expressionStatistics, Optional expressionVariable, - StatisticRange filterRange, - Optional session) + StatisticRange filterRange) { - double filterFactor = calculateFilterFactor(expressionStatistics, filterRange, session); - StatisticRange expressionRange = StatisticRange.from(expressionStatistics); StatisticRange intersectRange = expressionRange.intersect(filterRange); + + double filterFactor = expressionRange.overlapPercentWith(intersectRange); + PlanNodeStatsEstimate estimate = inputStatistics.mapOutputRowCount(rowCount -> filterFactor * (1 - expressionStatistics.getNullsFraction()) * rowCount); if (expressionVariable.isPresent()) { VariableStatsEstimate symbolNewEstimate = @@ -155,41 +142,12 @@ private static PlanNodeStatsEstimate estimateFilterRange( .setAverageRowSize(expressionStatistics.getAverageRowSize()) .setStatisticsRange(intersectRange) .setNullsFraction(0.0) - .setHistogram(DisjointRangeDomainHistogram.addConjunction(expressionStatistics.getHistogram(), intersectRange)) .build(); estimate = estimate.mapVariableColumnStatistics(expressionVariable.get(), oldStats -> symbolNewEstimate); } return estimate; } - private static double calculateFilterFactor(VariableStatsEstimate variableStatistics, StatisticRange filterRange, Optional session) - { - StatisticRange variableRange = 
StatisticRange.from(variableStatistics); - StatisticRange intersectRange = variableRange.intersect(filterRange); - Estimate filterEstimate; - if (session.map(SystemSessionProperties::shouldOptimizerUseHistograms).orElse(false)) { - Estimate distinctEstimate = isNaN(variableStatistics.getDistinctValuesCount()) ? Estimate.unknown() : Estimate.of(variableRange.getDistinctValuesCount()); - filterEstimate = HistogramCalculator.calculateFilterFactor(intersectRange, variableStatistics.getHistogram(), distinctEstimate, true); - if (log.isDebugEnabled()) { - double expressionFilter = variableRange.overlapPercentWith(intersectRange); - if (!Double.isNaN(expressionFilter) && - !filterEstimate.fuzzyEquals(Estimate.of(expressionFilter), .0001)) { - log.debug(String.format("histogram-calculated filter factor differs from the uniformity assumption:" + - "expression range: %s%n" + - "intersect range: %s%n" + - "overlapPercent: %s%n" + - "histogram: %s%n" + - "histogramFilterIntersect: %s%n", variableRange, intersectRange, expressionFilter, variableStatistics.getHistogram(), filterEstimate)); - } - } - } - else { - filterEstimate = Estimate.estimateFromDouble(variableRange.overlapPercentWith(intersectRange)); - } - - return filterEstimate.orElse(() -> UNKNOWN_FILTER_COEFFICIENT); - } - public static PlanNodeStatsEstimate estimateExpressionToExpressionComparison( PlanNodeStatsEstimate inputStatistics, VariableStatsEstimate leftExpressionStatistics, diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java index c44a5be61f147..2cdf660ce4b77 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java @@ -14,8 +14,6 @@ package com.facebook.presto.cost; -import 
com.facebook.presto.FullConnectorSession; -import com.facebook.presto.Session; import com.facebook.presto.common.type.Type; import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorSession; @@ -55,12 +53,7 @@ public TableStatistics filterStats( Map columnTypes) { PlanNodeStatsEstimate tableStats = toPlanNodeStats(tableStatistics, columnNames, columnTypes); - // TODO: Consider re-designing the filter calculator APIs so that a proper Session instance - // can be more easily populated - Optional filterSession = session instanceof FullConnectorSession ? - Optional.of(((FullConnectorSession) session).getSession()) : - Optional.empty(); - PlanNodeStatsEstimate filteredStats = filterStatsCalculator.filterStats(tableStats, predicate, session, filterSession); + PlanNodeStatsEstimate filteredStats = filterStatsCalculator.filterStats(tableStats, predicate, session); if (filteredStats.isOutputRowCountUnknown()) { filteredStats = tableStats.mapOutputRowCount(sourceRowCount -> tableStats.getOutputRowCount() * UNKNOWN_FILTER_COEFFICIENT); diff --git a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java b/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java deleted file mode 100644 index b602fb8de4f93..0000000000000 --- a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.collect.BoundType; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; -import com.google.common.collect.RangeSet; -import com.google.common.collect.TreeRangeSet; - -import java.util.Collection; -import java.util.HashSet; -import java.util.NoSuchElementException; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; - -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static com.facebook.presto.util.MoreMath.max; -import static com.facebook.presto.util.MoreMath.min; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static java.lang.Double.NEGATIVE_INFINITY; -import static java.lang.Double.POSITIVE_INFINITY; -import static java.lang.Double.isFinite; -import static java.util.Objects.hash; -import static java.util.Objects.requireNonNull; - -/** - * This class represents a set of disjoint ranges that span an input domain. - * Each range is used to represent filters over the domain of an original - * "source" histogram. - *
- * For example, assume a source histogram represents a uniform distribution - * over the range [0, 100]. Next, assume a query with multiple filters such as - * x < 10 OR x > 75. This translates to two disjoint ranges over - * the histogram of [0, 10) and (75, 100], representing roughly 35% of the - * values in the original dataset. Using the example above, a cumulative - * probability for value 5 represents 5% of the original dataset, but 20% (1/5) - * of the range of constrained dataset. Similarly, all values in [10, 75] should - * compute their cumulative probability as 40% (2/5). - *
- * The goal of this class is to implement the {@link ConnectorHistogram} API - * given a source histogram whose domain has been constrained by a set of filter - * ranges. - *
- * This class is intended to be immutable. Changing the set of ranges should - * result in a new copy being created. - */ -public class DisjointRangeDomainHistogram - implements ConnectorHistogram -{ - private final ConnectorHistogram source; - // use RangeSet as the internal representation of the ranges, but the constructor arguments - // use StatisticRange to support serialization and deserialization. - private final RangeSet rangeSet; - private final Range sourceSpan; - - @JsonCreator - public DisjointRangeDomainHistogram(@JsonProperty("source") ConnectorHistogram source, @JsonProperty("ranges") Collection ranges) - { - this(source, ranges.stream().map(StatisticRange::toRange).collect(Collectors.toSet())); - } - - public DisjointRangeDomainHistogram(ConnectorHistogram source, Iterable> ranges) - { - this.source = requireNonNull(source, "source is null"); - this.sourceSpan = getSourceSpan(source); - this.rangeSet = TreeRangeSet.create(ranges).subRangeSet(sourceSpan); - } - - private static Range getSourceSpan(ConnectorHistogram source) - { - return Range.closed( - source.inverseCumulativeProbability(0.0).orElse(() -> NEGATIVE_INFINITY), - source.inverseCumulativeProbability(1.0).orElse(() -> POSITIVE_INFINITY)); - } - - @JsonProperty - public ConnectorHistogram getSource() - { - return source; - } - - @JsonProperty - public Set getRanges() - { - return rangeSet.asRanges().stream().map(StatisticRange::fromRange).collect(Collectors.toSet()); - } - - public DisjointRangeDomainHistogram(ConnectorHistogram source) - { - this(source, ImmutableSet.>of()); - } - - @Override - public Estimate cumulativeProbability(double value, boolean inclusive) - { - // 1. compute the total probability for every existing range on the source - // 2. find the range, r, where `value` falls - // 3. compute the cumulative probability across all ranges that intersect [min, value] - // 4. 
divide the result from (3) by the result from (1) to get the true cumulative - // probability of the disjoint domains over the original histogram - if (Double.isNaN(value)) { - return Estimate.unknown(); - } - Optional> optionalSpan = getSpan(); - if (!optionalSpan.isPresent()) { - return Estimate.of(0.0); - } - Range span = optionalSpan.get(); - if (value <= span.lowerEndpoint()) { - return Estimate.of(0.0); - } - Range input = Range.range(span.lowerEndpoint(), span.lowerBoundType(), value, inclusive ? BoundType.CLOSED : BoundType.OPEN); - Estimate fullSetOverlap = calculateRangeSetOverlap(rangeSet); - RangeSet spanned = rangeSet.subRangeSet(input); - Estimate spannedOverlap = calculateRangeSetOverlap(spanned); - - return spannedOverlap.flatMap(spannedProbability -> - fullSetOverlap.map(fullSetProbability -> { - if (fullSetProbability == 0.0) { - return 0.0; - } - return min(spannedProbability / fullSetProbability, 1.0); - })); - } - - private Estimate calculateRangeSetOverlap(RangeSet ranges) - { - // we require knowing bounds on all ranges - double cumulativeTotal = 0.0; - for (Range range : ranges.asRanges()) { - Estimate rangeProbability = getRangeProbability(range); - if (rangeProbability.isUnknown()) { - return Estimate.unknown(); - } - cumulativeTotal += rangeProbability.getValue(); - } - return Estimate.of(cumulativeTotal); - } - - /** - * Calculates the percent of the source distribution that {@code range} - * spans. - * - * @param range the range over the source domain - * @return estimate of the total probability the range covers in the source - */ - private Estimate getRangeProbability(Range range) - { - return calculateFilterFactor(StatisticRange.fromRange(range), source, Estimate.unknown(), false); - } - - @Override - public Estimate inverseCumulativeProbability(double percentile) - { - checkArgument(percentile >= 0.0 && percentile <= 1.0, "percentile must fall in [0.0, 1.0]"); - // 1. 
compute the probability for each range on the source in order until reaching a range - // where the cumulative total exceeds the percentile argument (totalCumulative) - // 2. compute the source probability of the left endpoint of the given range (percentileLow) - // 3. compute the "true" source percentile: - // rangedPercentile = percentile - percentileLow - // - // percentileLow + (rangedPercentile * rangePercentileLength) - Optional> optionalSpan = getSpan(); - if (!optionalSpan.isPresent()) { - return Estimate.unknown(); - } - Range span = optionalSpan.get(); - if (percentile == 0.0 && isFinite(span.lowerEndpoint())) { - return source.inverseCumulativeProbability(0.0).map(sourceMin -> max(span.lowerEndpoint(), sourceMin)); - } - - if (percentile == 1.0 && isFinite(span.upperEndpoint())) { - return source.inverseCumulativeProbability(1.0).map(sourceMax -> min(span.upperEndpoint(), sourceMax)); - } - - Estimate totalCumulativeEstimate = calculateRangeSetOverlap(rangeSet); - if (totalCumulativeEstimate.isUnknown()) { - return Estimate.unknown(); - } - double totalCumulativeProbabilitySourceDomain = totalCumulativeEstimate.getValue(); - if (totalCumulativeProbabilitySourceDomain == 0.0) { - // calculations will fail with NaN - return Estimate.unknown(); - } - double cumulativeProbabilityNewDomain = 0.0; - double lastRangeEstimateSourceDomain = 0.0; - Range currentRange = null; - // find the range where the percentile falls - for (Range range : rangeSet.asRanges()) { - Estimate rangeEstimate = getRangeProbability(range); - if (rangeEstimate.isUnknown()) { - return Estimate.unknown(); - } - currentRange = range; - lastRangeEstimateSourceDomain = rangeEstimate.getValue(); - cumulativeProbabilityNewDomain += lastRangeEstimateSourceDomain / totalCumulativeProbabilitySourceDomain; - if (cumulativeProbabilityNewDomain >= percentile) { - break; - } - } - if (currentRange == null) { - // no ranges to iterate over. Did a constraint cut the entire domain of values? 
- return Estimate.unknown(); - } - Estimate rangeLeftSourceEstimate = source.cumulativeProbability(currentRange.lowerEndpoint(), currentRange.lowerBoundType() == BoundType.OPEN); - if (rangeLeftSourceEstimate.isUnknown()) { - return Estimate.unknown(); - } - double rangeLeftSource = rangeLeftSourceEstimate.getValue(); - double lastRangeProportionalProbability = lastRangeEstimateSourceDomain / totalCumulativeProbabilitySourceDomain; - double percentileLeftFromNewDomain = percentile - cumulativeProbabilityNewDomain + lastRangeProportionalProbability; - double percentilePoint = lastRangeEstimateSourceDomain * percentileLeftFromNewDomain / lastRangeProportionalProbability; - double finalPercentile = rangeLeftSource + percentilePoint; - - return source.inverseCumulativeProbability(min(max(finalPercentile, 0.0), 1.0)); - } - - /** - * Adds a new domain (logical disjunction) to the existing set. - * - * @param other the new range to add to the set. - * @return a new {@link DisjointRangeDomainHistogram} - */ - public DisjointRangeDomainHistogram addDisjunction(StatisticRange other) - { - Set> ranges = new HashSet<>(rangeSet.asRanges()); - ranges.add(other.toRange()); - return new DisjointRangeDomainHistogram(source, ranges); - } - - /** - * Adds a constraint (logical conjunction). This will constrain all ranges - * in the set to ones that are contained by the argument range. - * - * @param other the range that should enclose the set. - * @return a new {@link DisjointRangeDomainHistogram} where - */ - public DisjointRangeDomainHistogram addConjunction(StatisticRange other) - { - return new DisjointRangeDomainHistogram(source, rangeSet.subRangeSet(other.toRange()).asRanges()); - } - - /** - * Adds a new range to the available ranges that this histogram computes over - *
- * e.g. if the source histogram represents values [0, 100], and an existing - * range in the set constrains it to [0, 25], and this method is called with - * a range of [50, 75], then it will attempt to push [50, 75] down onto the - * existing histogram to expand the set of intervals that are used to - * computed probabilities to [[0, 25], [50, 75]]. - *
- * This method should be called for cases where we want to calculate plan - * statistics for queries that have multiple filters combined with OR. - * - * @param histogram the source histogram to add the range conjunction - * @param range the range representing the conjunction to add - * @return a new histogram with the conjunction applied. - */ - public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, StatisticRange range) - { - if (histogram instanceof DisjointRangeDomainHistogram) { - return ((DisjointRangeDomainHistogram) histogram).addDisjunction(range); - } - - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); - } - - /** - * Similar to {@link #addDisjunction(ConnectorHistogram, StatisticRange)} this method constrains - * the entire domain such that all ranges in the set intersect with the given range - * argument to this method. - *
- * This should be used when an AND clause is present in the query and all tuples MUST satisfy - * the condition. - * - * @param histogram the source histogram - * @param range the range of values that the entire histogram's domain must fall within - * @return a histogram with the new range constraint - */ - public static ConnectorHistogram addConjunction(ConnectorHistogram histogram, StatisticRange range) - { - if (histogram instanceof DisjointRangeDomainHistogram) { - return ((DisjointRangeDomainHistogram) histogram).addConjunction(range); - } - - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); - } - - /** - * @return the span if it exists, empty otherwise - */ - private Optional> getSpan() - { - try { - return Optional.of(rangeSet.span()); - } - catch (NoSuchElementException e) { - return Optional.empty(); - } - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("source", this.source) - .add("sourceSpan", this.sourceSpan) - .add("rangeSet", this.rangeSet) - .toString(); - } - - @Override - public boolean equals(Object o) - { - if (o == this) { - return true; - } - if (!(o instanceof DisjointRangeDomainHistogram)) { - return false; - } - DisjointRangeDomainHistogram other = (DisjointRangeDomainHistogram) o; - return Objects.equals(source, other.source) && - Objects.equals(sourceSpan, other.sourceSpan) && - Objects.equals(rangeSet, other.rangeSet); - } - - @Override - public int hashCode() - { - return hash(source, sourceSpan, rangeSet); - } -} diff --git a/presto-main/src/main/java/com/facebook/presto/cost/FilterStatsCalculator.java b/presto-main/src/main/java/com/facebook/presto/cost/FilterStatsCalculator.java index 81f7a13d47613..9e35fa478501a 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/FilterStatsCalculator.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/FilterStatsCalculator.java @@ -135,11 +135,10 @@ public PlanNodeStatsEstimate filterStats( public 
PlanNodeStatsEstimate filterStats( PlanNodeStatsEstimate statsEstimate, RowExpression predicate, - ConnectorSession session, - Optional systemSession) + ConnectorSession session) { RowExpression simplifiedExpression = simplifyExpression(session, predicate); - return new FilterRowExpressionStatsCalculatingVisitor(statsEstimate, session, metadata.getFunctionAndTypeManager(), systemSession).process(simplifiedExpression); + return new FilterRowExpressionStatsCalculatingVisitor(statsEstimate, session, metadata.getFunctionAndTypeManager()).process(simplifiedExpression); } public PlanNodeStatsEstimate filterStats( @@ -147,7 +146,7 @@ public PlanNodeStatsEstimate filterStats( RowExpression predicate, Session session) { - return filterStats(statsEstimate, predicate, session.toConnectorSession(), Optional.of(session)); + return filterStats(statsEstimate, predicate, session.toConnectorSession()); } private Expression simplifyExpression(Session session, Expression predicate, TypeProvider types) @@ -442,13 +441,13 @@ protected PlanNodeStatsEstimate visitComparisonExpression(ComparisonExpression n return visitBooleanLiteral(FALSE_LITERAL, null); } OptionalDouble literal = toStatsRepresentation(metadata, session, getType(left), literalValue); - return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, literal, operator, Optional.of(session)); + return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, literal, operator); } VariableStatsEstimate rightStats = getExpressionStats(right); if (rightStats.isSingleValue()) { OptionalDouble value = isNaN(rightStats.getLowValue()) ? OptionalDouble.empty() : OptionalDouble.of(rightStats.getLowValue()); - return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, value, operator, Optional.of(session)); + return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, value, operator); } Optional rightVariable = right instanceof SymbolReference ? 
Optional.of(toVariable(right)) : Optional.empty(); @@ -494,15 +493,13 @@ private class FilterRowExpressionStatsCalculatingVisitor { private final PlanNodeStatsEstimate input; private final ConnectorSession session; - private final Optional systemSession; private final FunctionAndTypeManager functionAndTypeManager; - FilterRowExpressionStatsCalculatingVisitor(PlanNodeStatsEstimate input, ConnectorSession session, FunctionAndTypeManager functionAndTypeManager, Optional systemSession) + FilterRowExpressionStatsCalculatingVisitor(PlanNodeStatsEstimate input, ConnectorSession session, FunctionAndTypeManager functionAndTypeManager) { this.input = requireNonNull(input, "input is null"); this.session = requireNonNull(session, "session is null"); this.functionAndTypeManager = requireNonNull(functionAndTypeManager, "functionManager is null"); - this.systemSession = requireNonNull(systemSession, "systemSession is null"); } @Override @@ -589,13 +586,13 @@ public PlanNodeStatsEstimate visitCall(CallExpression node, Void context) return visitConstant(constantNull(right.getSourceLocation(), BOOLEAN), null); } OptionalDouble literal = toStatsRepresentation(metadata.getFunctionAndTypeManager(), session, right.getType(), rightValue); - return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, literal, getComparisonOperator(operatorType), systemSession); + return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, literal, getComparisonOperator(operatorType)); } VariableStatsEstimate rightStats = getRowExpressionStats(right); if (rightStats.isSingleValue()) { OptionalDouble value = isNaN(rightStats.getLowValue()) ? 
OptionalDouble.empty() : OptionalDouble.of(rightStats.getLowValue()); - return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, value, getComparisonOperator(operatorType), systemSession); + return estimateExpressionToLiteralComparison(input, leftStats, leftVariable, value, getComparisonOperator(operatorType)); } Optional rightVariable = right instanceof VariableReferenceExpression ? Optional.of((VariableReferenceExpression) right) : Optional.empty(); @@ -679,7 +676,7 @@ public PlanNodeStatsEstimate visitInputReference(InputReferenceExpression node, private FilterRowExpressionStatsCalculatingVisitor newEstimate(PlanNodeStatsEstimate input) { - return new FilterRowExpressionStatsCalculatingVisitor(input, session, functionAndTypeManager, systemSession); + return new FilterRowExpressionStatsCalculatingVisitor(input, session, functionAndTypeManager); } private PlanNodeStatsEstimate process(RowExpression rowExpression) diff --git a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java b/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java deleted file mode 100644 index 98accc204b2c2..0000000000000 --- a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.math.DoubleMath; - -import java.util.Optional; - -import static java.lang.Double.isFinite; -import static java.lang.Double.isNaN; -import static java.lang.Math.min; - -public class HistogramCalculator -{ - private HistogramCalculator() - {} - - /** - * Calculates the "filter factor" corresponding to the overlap between the statistic range - * and the histogram distribution. - *
- * The filter factor is a fractional value in [0.0, 1.0] that represents the proportion of - * tuples in the source column that would be included in the result of a filter where the valid - * values in the filter are represented by the {@code range} parameter of this function. - * - * @param range the intersecting range with the histogram - * @param histogram the source histogram - * @param totalDistinctValues the total number of distinct values in the domain of the histogram - * @param useHeuristics whether to return heuristic values based on constants and/or distinct - * value counts. If false, {@link Estimate#unknown()} will be returned in any case a - * heuristic would have been used - * @return an estimate, x, where 0.0 <= x <= 1.0. - */ - public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics) - { - boolean openHigh = range.getOpenHigh(); - boolean openLow = range.getOpenLow(); - Estimate min = histogram.inverseCumulativeProbability(0.0); - Estimate max = histogram.inverseCumulativeProbability(1.0); - - // range is either above or below histogram - if ((!max.isUnknown() && max.getValue() < range.getLow()) - || (!min.isUnknown() && min.getValue() > range.getHigh())) { - return Estimate.of(0.0); - } - - // one of the max/min bounds can't be determined - if ((max.isUnknown() && !min.isUnknown()) || (!max.isUnknown() && min.isUnknown())) { - // when the range length is 0, the filter factor should be 1/distinct value count - if (!useHeuristics) { - return Estimate.unknown(); - } - - if (range.length() == 0.0) { - return totalDistinctValues.map(distinct -> 1.0 / distinct); - } - - if (isFinite(range.length())) { - return Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); - } - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); - } - - // we know the bounds are both known, so calculate the 
percentile for each bound - // The inclusivity arguments can be derived from the open-ness of the interval we're - // calculating the filter factor for - // e.g. given a variable with values in [0, 10] to calculate the filter of - // [1, 9) (openness: false, true) we need the percentile from - // [0.0 to 1.0) (openness: false, true) and from [0.0, 9.0) (openness: false, true) - // thus for the "lowPercentile" calculation we should pass "false" to be non-inclusive - // (same as openness) however, on the high-end we want the inclusivity to be the opposite - // of the openness since if it's open, we _don't_ want to include the bound. - Estimate lowPercentile = histogram.cumulativeProbability(range.getLow(), openLow); - Estimate highPercentile = histogram.cumulativeProbability(range.getHigh(), !openHigh); - - // both bounds are probably infinity, use the infinite-infinite heuristic - if (lowPercentile.isUnknown() || highPercentile.isUnknown()) { - if (!useHeuristics) { - return Estimate.unknown(); - } - // in the case the histogram has no values - if (totalDistinctValues.equals(Estimate.zero()) || range.getDistinctValuesCount() == 0.0) { - return Estimate.of(0.0); - } - - // in the case only one is unknown - if (((lowPercentile.isUnknown() && !highPercentile.isUnknown()) || - (!lowPercentile.isUnknown() && highPercentile.isUnknown())) && - isFinite(range.length())) { - return useHeuristics ? 
Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR) : Estimate.unknown(); - } - - if (range.length() == 0.0) { - return totalDistinctValues.map(distinct -> 1.0 / distinct); - } - - if (!isNaN(range.getDistinctValuesCount())) { - return totalDistinctValues.map(distinct -> min(1.0, range.getDistinctValuesCount() / distinct)); - } - - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); - } - - // in the case the range is a single value, this can occur if the input - // filter range is a single value (low == high) OR in the case that the - // bounds of the filter or this histogram are infinite. - // in the case of infinite bounds, we should return an estimate that - // correlates to the overlapping distinct values. - if (lowPercentile.equals(highPercentile)) { - if (!useHeuristics) { - return Estimate.zero(); - } - // If one of the bounds is unknown, but both percentiles are equal, - // it's likely that a heuristic value was returned - if (max.isUnknown() || min.isUnknown()) { - return totalDistinctValues.flatMap(distinct -> lowPercentile.map(lowPercent -> distinct * lowPercent)); - } - - return totalDistinctValues.map(distinct -> 1.0 / distinct); - } - - // in the case that we return the entire range, the returned factor percent should be - // proportional to the number of distinct values in the range - if (lowPercentile.equals(Estimate.zero()) && highPercentile.equals(Estimate.of(1.0)) && min.isUnknown() && max.isUnknown()) { - if (!useHeuristics) { - return Estimate.unknown(); - } - - if (totalDistinctValues.equals(Estimate.zero())) { - return Estimate.of(1.0); - } - return totalDistinctValues.flatMap(totalDistinct -> { - if (DoubleMath.fuzzyEquals(totalDistinct, 0.0, 1E-6)) { - return Estimate.unknown(); - } - return Estimate.of(min(1.0, range.getDistinctValuesCount() / totalDistinct)); - }) - // in the case totalDistinct is NaN or 0 - .or(() -> 
Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR)); - } - - return Optional.of(lowPercentile) - .filter(lowPercent -> !lowPercent.isUnknown()) - .map(Estimate::getValue) - .map(lowPercent -> Optional.of(highPercentile) - .filter(highPercent -> !highPercent.isUnknown()) - .map(Estimate::getValue) - .map(highPercent -> highPercent - lowPercent) - .map(Estimate::of) - .orElseGet(() -> Estimate.of(1.0))) - .orElse(highPercentile); - } -} diff --git a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java index 2c66cab3a5230..bf040ea81fa74 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java @@ -33,7 +33,6 @@ import java.util.Queue; import static com.facebook.presto.SystemSessionProperties.getDefaultJoinSelectivityCoefficient; -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; import static com.facebook.presto.cost.FilterStatsCalculator.UNKNOWN_FILTER_COEFFICIENT; import static com.facebook.presto.cost.VariableStatsEstimate.buildFrom; import static com.facebook.presto.expressions.LogicalRowExpressions.extractConjuncts; @@ -247,14 +246,12 @@ private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stat .setNullsFraction(0) .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv) - .setHistogram(addConjunction(leftStats.getHistogram(), intersect)) .build(); VariableStatsEstimate newRightStats = buildFrom(rightStats) .setNullsFraction(0) .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv) - .setHistogram(addConjunction(rightStats.getHistogram(), intersect)) .build(); PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(stats) diff --git a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java 
b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java index f62ae79d43db1..5217a69b60898 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java @@ -13,9 +13,6 @@ */ package com.facebook.presto.cost; -import com.facebook.presto.spi.statistics.ConnectorHistogram; - -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; import static com.google.common.base.Preconditions.checkArgument; import static java.lang.Double.NaN; import static java.lang.Double.isNaN; @@ -122,18 +119,16 @@ public static PlanNodeStatsEstimate capStats(PlanNodeStatsEstimate stats, PlanNo // for simplicity keep the average row size the same as in the input // in most cases the average row size doesn't change after applying filters newSymbolStats.setAverageRowSize(symbolStats.getAverageRowSize()); + newSymbolStats.setDistinctValuesCount(min(symbolStats.getDistinctValuesCount(), capSymbolStats.getDistinctValuesCount())); - double newLow = max(symbolStats.getLowValue(), capSymbolStats.getLowValue()); - double newHigh = min(symbolStats.getHighValue(), capSymbolStats.getHighValue()); - newSymbolStats.setLowValue(newLow); - newSymbolStats.setHighValue(newHigh); + newSymbolStats.setLowValue(max(symbolStats.getLowValue(), capSymbolStats.getLowValue())); + newSymbolStats.setHighValue(min(symbolStats.getHighValue(), capSymbolStats.getHighValue())); double numberOfNulls = stats.getOutputRowCount() * symbolStats.getNullsFraction(); double capNumberOfNulls = cap.getOutputRowCount() * capSymbolStats.getNullsFraction(); double cappedNumberOfNulls = min(numberOfNulls, capNumberOfNulls); double cappedNullsFraction = cappedRowCount == 0 ? 
1 : cappedNumberOfNulls / cappedRowCount; newSymbolStats.setNullsFraction(cappedNullsFraction); - newSymbolStats.setHistogram(addConjunction(symbolStats.getHistogram(), new StatisticRange(newLow, newHigh, 0))); result.addVariableStatistics(symbol, newSymbolStats.build()); }); @@ -149,44 +144,25 @@ private static PlanNodeStatsEstimate createZeroStats(PlanNodeStatsEstimate stats return result.build(); } - protected enum RangeAdditionStrategy - { - ADD_AND_SUM_DISTINCT(StatisticRange::addAndSumDistinctValues), - ADD_AND_MAX_DISTINCT(StatisticRange::addAndMaxDistinctValues), - ADD_AND_COLLAPSE_DISTINCT(StatisticRange::addAndCollapseDistinctValues), - INTERSECT(StatisticRange::intersect); - private final RangeAdditionFunction rangeAdditionFunction; - - RangeAdditionStrategy(RangeAdditionFunction rangeAdditionFunction) - { - this.rangeAdditionFunction = rangeAdditionFunction; - } - - public RangeAdditionFunction getRangeAdditionFunction() - { - return rangeAdditionFunction; - } - } - @FunctionalInterface - protected interface RangeAdditionFunction + private interface RangeAdditionStrategy { StatisticRange add(StatisticRange leftRange, StatisticRange rightRange); } public static PlanNodeStatsEstimate addStatsAndSumDistinctValues(PlanNodeStatsEstimate left, PlanNodeStatsEstimate right) { - return addStats(left, right, RangeAdditionStrategy.ADD_AND_SUM_DISTINCT); + return addStats(left, right, StatisticRange::addAndSumDistinctValues); } public static PlanNodeStatsEstimate addStatsAndMaxDistinctValues(PlanNodeStatsEstimate left, PlanNodeStatsEstimate right) { - return addStats(left, right, RangeAdditionStrategy.ADD_AND_MAX_DISTINCT); + return addStats(left, right, StatisticRange::addAndMaxDistinctValues); } public static PlanNodeStatsEstimate addStatsAndCollapseDistinctValues(PlanNodeStatsEstimate left, PlanNodeStatsEstimate right) { - return addStats(left, right, RangeAdditionStrategy.ADD_AND_COLLAPSE_DISTINCT); + return addStats(left, right, 
StatisticRange::addAndCollapseDistinctValues); } public static PlanNodeStatsEstimate addStatsAndIntersect(PlanNodeStatsEstimate left, PlanNodeStatsEstimate right) @@ -209,7 +185,7 @@ public static PlanNodeStatsEstimate addStatsAndIntersect(PlanNodeStatsEstimate l right.getOutputRowCount() * rstats.overlapPercentWith(lstats)); }).reduce(Math::min).orElse(estimatedRowCount); - buildVariableStatistics(left, right, statsBuilder, rowCount, RangeAdditionStrategy.INTERSECT); + buildVariableStatistics(left, right, statsBuilder, rowCount, StatisticRange::intersect); return statsBuilder.setOutputRowCount(rowCount).build(); } @@ -217,7 +193,7 @@ public static PlanNodeStatsEstimate addStatsAndIntersect(PlanNodeStatsEstimate l private static PlanNodeStatsEstimate addStats( PlanNodeStatsEstimate left, PlanNodeStatsEstimate right, - RangeAdditionStrategy strategy) + RangeAdditionStrategy rangeAdder) { double rowCount = left.getOutputRowCount() + right.getOutputRowCount(); double totalSize = left.getTotalSize() + right.getTotalSize(); @@ -227,7 +203,7 @@ private static PlanNodeStatsEstimate addStats( } PlanNodeStatsEstimate.Builder statsBuilder = PlanNodeStatsEstimate.builder(); - buildVariableStatistics(left, right, statsBuilder, rowCount, strategy); + buildVariableStatistics(left, right, statsBuilder, rowCount, rangeAdder); return statsBuilder.setOutputRowCount(rowCount) .setTotalSize(totalSize).build(); @@ -238,7 +214,7 @@ private static void buildVariableStatistics( PlanNodeStatsEstimate right, PlanNodeStatsEstimate.Builder statsBuilder, double estimatedRowCount, - RangeAdditionStrategy strategy) + RangeAdditionStrategy rangeAdder) { concat(left.getVariablesWithKnownStatistics().stream(), right.getVariablesWithKnownStatistics().stream()) .distinct() @@ -254,7 +230,7 @@ else if (estimatedRowCount > 0) { right.getVariableStatistics(symbol), right.getOutputRowCount(), estimatedRowCount, - strategy); + rangeAdder); } statsBuilder.addVariableStatistics(symbol, symbolStats); }); @@ 
-273,7 +249,7 @@ private static VariableStatsEstimate addColumnStats( StatisticRange leftRange = StatisticRange.from(leftStats); StatisticRange rightRange = StatisticRange.from(rightStats); - StatisticRange sum = strategy.getRangeAdditionFunction().add(leftRange, rightRange); + StatisticRange sum = strategy.add(leftRange, rightRange); double nullsCountRight = rightStats.getNullsFraction() * rightRows; double nullsCountLeft = leftStats.getNullsFraction() * leftRows; double totalSizeLeft = (leftRows - nullsCountLeft) * leftStats.getAverageRowSize(); @@ -284,15 +260,10 @@ private static VariableStatsEstimate addColumnStats( // FIXME, weights to average. left and right should be equal in most cases anyway double newAverageRowSize = newNonNullsRowCount == 0 ? 0 : ((totalSizeLeft + totalSizeRight) / newNonNullsRowCount); - ConnectorHistogram newHistogram = RangeAdditionStrategy.INTERSECT.equals(strategy) ? - DisjointRangeDomainHistogram.addConjunction(leftStats.getHistogram(), rightRange) : - DisjointRangeDomainHistogram.addDisjunction(leftStats.getHistogram(), rightRange); - return VariableStatsEstimate.builder() .setStatisticsRange(sum) .setAverageRowSize(newAverageRowSize) .setNullsFraction(newNullsFraction) - .setHistogram(newHistogram) .build(); } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java index 4d80e13cb92cc..b613f655dc01d 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java @@ -13,19 +13,11 @@ */ package com.facebook.presto.cost; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.collect.BoundType; -import com.google.common.collect.Range; - import java.util.Objects; -import static com.facebook.presto.util.MoreMath.nearlyEqual; import static 
com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; -import static java.lang.Double.NEGATIVE_INFINITY; import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; @@ -36,36 +28,22 @@ public class StatisticRange { - protected static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; - protected static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; + private static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; + private static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; // TODO unify field and method names with SymbolStatsEstimate /** * {@code NaN} represents empty range ({@code high} must be {@code NaN} too) */ private final double low; - - /** - * Whether the low side of the range is open. e.g. The value is *not* included in the range. - */ - private final boolean openLow; /** * {@code NaN} represents empty range ({@code low} must be {@code NaN} too) */ private final double high; - /** - * Whether the high side of the range is open. e.g. the value is *not* included in the range. 
- */ - private final boolean openHigh; private final double distinctValues; - @JsonCreator - public StatisticRange(@JsonProperty("low") double low, - @JsonProperty("openLow") boolean openLow, - @JsonProperty("high") double high, - @JsonProperty("openHigh") boolean openHigh, - @JsonProperty("distinctValuesCount") double distinctValues) + public StatisticRange(double low, double high, double distinctValues) { checkArgument( low <= high || (isNaN(low) && isNaN(high)), @@ -74,58 +52,36 @@ public StatisticRange(@JsonProperty("low") double low, high); this.low = low; this.high = high; - this.openLow = openLow; - this.openHigh = openHigh; checkArgument(distinctValues >= 0 || isNaN(distinctValues), "Distinct values count should be non-negative, got: %s", distinctValues); this.distinctValues = distinctValues; } - public StatisticRange(double low, double high, double distinctValues) - { - this(low, false, high, false, distinctValues); - } - public static StatisticRange empty() { - return new StatisticRange(NaN, false, NaN, false, 0); + return new StatisticRange(NaN, NaN, 0); } public static StatisticRange from(VariableStatsEstimate estimate) { - return new StatisticRange(estimate.getLowValue(), false, estimate.getHighValue(), false, estimate.getDistinctValuesCount()); + return new StatisticRange(estimate.getLowValue(), estimate.getHighValue(), estimate.getDistinctValuesCount()); } - @JsonProperty public double getLow() { return low; } - @JsonProperty public double getHigh() { return high; } - @JsonProperty public double getDistinctValuesCount() { return distinctValues; } - @JsonProperty - public boolean getOpenLow() - { - return openLow; - } - - @JsonProperty - public boolean getOpenHigh() - { - return openHigh; - } - public double length() { return high - low; @@ -186,14 +142,9 @@ private double overlappingDistinctValues(StatisticRange other) public StatisticRange intersect(StatisticRange other) { double newLow = max(low, other.low); - boolean newOpenLow = newLow == low ? 
openLow : other.openLow; - // epsilon is an arbitrary choice - newOpenLow = nearlyEqual(low, other.low, 1E-10) ? openLow || other.openLow : newOpenLow; double newHigh = min(high, other.high); - boolean newOpenHigh = newHigh == high ? openHigh : other.openHigh; - newOpenHigh = nearlyEqual(high, other.high, 1E-10) ? openHigh || other.openHigh : newOpenHigh; if (newLow <= newHigh) { - return new StatisticRange(newLow, newOpenLow, newHigh, newOpenHigh, overlappingDistinctValues(other)); + return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other)); } return empty(); } @@ -201,13 +152,13 @@ public StatisticRange intersect(StatisticRange other) public StatisticRange addAndSumDistinctValues(StatisticRange other) { double newDistinctValues = distinctValues + other.distinctValues; - return expandRangeWithNewDistinct(newDistinctValues, other); + return new StatisticRange(minExcludeNaN(low, other.low), maxExcludeNaN(high, other.high), newDistinctValues); } public StatisticRange addAndMaxDistinctValues(StatisticRange other) { double newDistinctValues = max(distinctValues, other.distinctValues); - return expandRangeWithNewDistinct(newDistinctValues, other); + return new StatisticRange(minExcludeNaN(low, other.low), maxExcludeNaN(high, other.high), newDistinctValues); } public StatisticRange addAndCollapseDistinctValues(StatisticRange other) @@ -219,41 +170,7 @@ public StatisticRange addAndCollapseDistinctValues(StatisticRange other) double maxOverlappingValues = max(overlapDistinctValuesThis, overlapDistinctValuesOther); double newDistinctValues = maxOverlappingValues + (1 - overlapPercentOfThis) * distinctValues + (1 - overlapPercentOfOther) * other.distinctValues; - return expandRangeWithNewDistinct(newDistinctValues, other); - } - - public Range toRange() - { - return Range.range(low, openLow ? BoundType.OPEN : BoundType.CLOSED, high, openHigh ? 
BoundType.OPEN : BoundType.CLOSED); - } - - public static StatisticRange fromRange(Range range) - { - return new StatisticRange( - range.hasLowerBound() ? range.lowerEndpoint() : NEGATIVE_INFINITY, - !range.hasLowerBound() || range.lowerBoundType() == BoundType.OPEN, - range.hasUpperBound() ? range.upperEndpoint() : POSITIVE_INFINITY, - !range.hasUpperBound() || range.upperBoundType() == BoundType.OPEN, - NaN); - } - - private StatisticRange expandRangeWithNewDistinct(double newDistinctValues, StatisticRange other) - { - double newLow = minExcludeNaN(low, other.low); - boolean newOpenLow = getNewEndpointOpennessLow(this, other, newLow); - double newHigh = maxExcludeNaN(high, other.high); - boolean newOpenHigh = getNewEndpointOpennessHigh(this, other, newHigh); - return new StatisticRange(newLow, newOpenLow, newHigh, newOpenHigh, newDistinctValues); - } - - private static boolean getNewEndpointOpennessLow(StatisticRange first, StatisticRange second, double newLow) - { - return newLow == first.low ? first.openLow : second.openLow; - } - - private static boolean getNewEndpointOpennessHigh(StatisticRange first, StatisticRange second, double newHigh) - { - return newHigh == first.high ? first.openHigh : second.openHigh; + return new StatisticRange(minExcludeNaN(low, other.low), maxExcludeNaN(high, other.high), newDistinctValues); } private static double minExcludeNaN(double v1, double v2) @@ -289,23 +206,21 @@ public boolean equals(Object o) } StatisticRange that = (StatisticRange) o; return Double.compare(that.low, low) == 0 && - that.openLow == openLow && Double.compare(that.high, high) == 0 && - that.openHigh == openHigh && Double.compare(that.distinctValues, distinctValues) == 0; } @Override public int hashCode() { - return Objects.hash(low, openLow, high, openHigh, distinctValues); + return Objects.hash(low, high, distinctValues); } @Override public String toString() { return toStringHelper(this) - .add("range", format("%s%s..%s%s", openLow ? 
"(" : "[", low, high, openHigh ? ")" : "]")) + .add("range", format("[%s-%s]", low, high)) .add("ndv", distinctValues) .toString(); } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/StatsUtil.java b/presto-main/src/main/java/com/facebook/presto/cost/StatsUtil.java index 63f3c061c7df8..3e33257e3f57e 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/StatsUtil.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/StatsUtil.java @@ -90,7 +90,6 @@ public static VariableStatsEstimate toVariableStatsEstimate(TableStatistics tabl result.setLowValue(range.getMin()); result.setHighValue(range.getMax()); }); - columnStatistics.getHistogram().ifPresent(result::setHistogram); return result.build(); } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java b/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java deleted file mode 100644 index d06232d1fb608..0000000000000 --- a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Verify.verify; -import static java.lang.Double.isInfinite; -import static java.lang.Double.isNaN; -import static java.lang.Math.max; -import static java.lang.Math.min; -import static java.util.Objects.hash; - -/** - * This {@link ConnectorHistogram} implementation returns values assuming a - * uniform distribution between a given high and low value. - *
- * In the case that statistics don't exist for a particular table, the Presto - * optimizer will fall back on this uniform distribution assumption. - */ -public class UniformDistributionHistogram - implements ConnectorHistogram -{ - private final double lowValue; - private final double highValue; - - @JsonCreator - public UniformDistributionHistogram( - @JsonProperty("lowValue") double lowValue, - @JsonProperty("highValue") double highValue) - { - verify(isNaN(lowValue) || isNaN(highValue) || (lowValue <= highValue), "lowValue must be <= highValue"); - this.lowValue = lowValue; - this.highValue = highValue; - } - - @JsonProperty - public double getLowValue() - { - return lowValue; - } - - @JsonProperty - public double getHighValue() - { - return highValue; - } - - @Override - public Estimate cumulativeProbability(double value, boolean inclusive) - { - if (isNaN(lowValue) || - isNaN(highValue) || - isNaN(value)) { - return Estimate.unknown(); - } - - if (value >= highValue) { - return Estimate.of(1.0); - } - - if (value <= lowValue) { - return Estimate.of(0.0); - } - - if (isInfinite(lowValue) || isInfinite(highValue)) { - return Estimate.unknown(); - } - - return Estimate.of(min(1.0, max(0.0, ((value - lowValue) / (highValue - lowValue))))); - } - - @Override - public Estimate inverseCumulativeProbability(double percentile) - { - checkArgument(percentile >= 0.0 && percentile <= 1.0, "percentile must be in [0.0, 1.0]: " + percentile); - if (isNaN(lowValue) || - isNaN(highValue)) { - return Estimate.unknown(); - } - - if (percentile == 0.0 && !isInfinite(lowValue)) { - return Estimate.of(lowValue); - } - - if (percentile == 1.0 && !isInfinite(highValue)) { - return Estimate.of(highValue); - } - - if (isInfinite(lowValue) || isInfinite(highValue)) { - return Estimate.unknown(); - } - - return Estimate.of(lowValue + (percentile * (highValue - lowValue))); - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("lowValue", lowValue) - 
.add("highValue", highValue) - .toString(); - } - - @Override - public boolean equals(Object o) - { - if (this == o) { - return true; - } - if (!(o instanceof UniformDistributionHistogram)) { - return false; - } - - UniformDistributionHistogram other = (UniformDistributionHistogram) o; - return equalsOrBothNaN(lowValue, other.lowValue) && - equalsOrBothNaN(highValue, other.highValue); - } - - @Override - public int hashCode() - { - return hash(lowValue, highValue); - } - - private static boolean equalsOrBothNaN(Double first, Double second) - { - return first.equals(second) || (Double.isNaN(first) && Double.isNaN(second)); - } -} diff --git a/presto-main/src/main/java/com/facebook/presto/cost/VariableStatsEstimate.java b/presto-main/src/main/java/com/facebook/presto/cost/VariableStatsEstimate.java index 01fd61b2cae24..2e13cf5a0fde7 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/VariableStatsEstimate.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/VariableStatsEstimate.java @@ -13,12 +13,10 @@ */ package com.facebook.presto.cost; -import com.facebook.presto.spi.statistics.ConnectorHistogram; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import java.util.Objects; -import java.util.Optional; import java.util.function.Function; import static com.google.common.base.MoreObjects.toStringHelper; @@ -29,7 +27,6 @@ import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; import static java.lang.String.format; -import static java.util.Objects.requireNonNull; public class VariableStatsEstimate { @@ -42,7 +39,6 @@ public class VariableStatsEstimate private final double nullsFraction; private final double averageRowSize; private final double distinctValuesCount; - private final ConnectorHistogram histogram; public static VariableStatsEstimate unknown() { @@ -60,8 +56,7 @@ public VariableStatsEstimate( @JsonProperty("highValue") double highValue, 
@JsonProperty("nullsFraction") double nullsFraction, @JsonProperty("averageRowSize") double averageRowSize, - @JsonProperty("distinctValuesCount") double distinctValuesCount, - @JsonProperty("histogram") ConnectorHistogram histogram) + @JsonProperty("distinctValuesCount") double distinctValuesCount) { checkArgument( lowValue <= highValue || (isNaN(lowValue) && isNaN(highValue)), @@ -84,16 +79,6 @@ public VariableStatsEstimate( checkArgument(distinctValuesCount >= 0 || isNaN(distinctValuesCount), "Distinct values count should be non-negative, got: %s", distinctValuesCount); // TODO normalize distinctValuesCount for an empty range (or validate it is already normalized) this.distinctValuesCount = distinctValuesCount; - this.histogram = requireNonNull(histogram, "histogram is null"); - } - - public VariableStatsEstimate(double lowValue, - double highValue, - double nullsFraction, - double averageRowSize, - double distinctValuesCount) - { - this(lowValue, highValue, nullsFraction, averageRowSize, distinctValuesCount, new UniformDistributionHistogram(lowValue, highValue)); } @JsonProperty @@ -114,12 +99,6 @@ public double getNullsFraction() return nullsFraction; } - @JsonProperty - public ConnectorHistogram getHistogram() - { - return histogram; - } - public StatisticRange statisticRange() { return new StatisticRange(lowValue, highValue, distinctValuesCount); @@ -174,8 +153,6 @@ public boolean equals(Object o) return false; } VariableStatsEstimate that = (VariableStatsEstimate) o; - // histograms are explicitly left out because equals calculations would - // be expensive. 
return Double.compare(nullsFraction, that.nullsFraction) == 0 && Double.compare(averageRowSize, that.averageRowSize) == 0 && Double.compare(distinctValuesCount, that.distinctValuesCount) == 0 && @@ -197,7 +174,6 @@ public String toString() .add("nulls", nullsFraction) .add("ndv", distinctValuesCount) .add("rowSize", averageRowSize) - .add("histogram", histogram) .toString(); } @@ -213,8 +189,7 @@ public static Builder buildFrom(VariableStatsEstimate other) .setHighValue(other.getHighValue()) .setNullsFraction(other.getNullsFraction()) .setAverageRowSize(other.getAverageRowSize()) - .setDistinctValuesCount(other.getDistinctValuesCount()) - .setHistogram(other.getHistogram()); + .setDistinctValuesCount(other.getDistinctValuesCount()); } public static final class Builder @@ -224,7 +199,6 @@ public static final class Builder private double nullsFraction = NaN; private double averageRowSize = NaN; private double distinctValuesCount = NaN; - private Optional histogram = Optional.empty(); public Builder setStatisticsRange(StatisticRange range) { @@ -263,16 +237,9 @@ public Builder setDistinctValuesCount(double distinctValuesCount) return this; } - public Builder setHistogram(ConnectorHistogram histogram) - { - this.histogram = Optional.of(histogram); - return this; - } - public VariableStatsEstimate build() { - return new VariableStatsEstimate(lowValue, highValue, nullsFraction, averageRowSize, distinctValuesCount, - histogram.orElseGet(() -> new UniformDistributionHistogram(lowValue, highValue))); + return new VariableStatsEstimate(lowValue, highValue, nullsFraction, averageRowSize, distinctValuesCount); } } } diff --git a/presto-main/src/main/java/com/facebook/presto/sql/analyzer/FeaturesConfig.java b/presto-main/src/main/java/com/facebook/presto/sql/analyzer/FeaturesConfig.java index 95aac7243cbb9..8d13f68ffa6a1 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/analyzer/FeaturesConfig.java +++ 
b/presto-main/src/main/java/com/facebook/presto/sql/analyzer/FeaturesConfig.java @@ -308,7 +308,6 @@ public class FeaturesConfig private boolean limitNumberOfGroupsForKHyperLogLogAggregations = true; private boolean generateDomainFilters; private CreateView.Security defaultViewSecurityMode = DEFINER; - private boolean useHistograms; public enum PartitioningPrecisionStrategy { @@ -3102,17 +3101,4 @@ public FeaturesConfig setDefaultViewSecurityMode(CreateView.Security securityMod this.defaultViewSecurityMode = securityMode; return this; } - - public boolean isUseHistograms() - { - return useHistograms; - } - - @Config("optimizer.use-histograms") - @ConfigDescription("Use histogram statistics in cost-based calculations in the optimizer") - public FeaturesConfig setUseHistograms(boolean useHistograms) - { - this.useHistograms = useHistograms; - return this; - } } diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index 7a7396824e091..f1679c77eea61 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -265,7 +265,6 @@ private static List buildColumnsNames() .add("row_count") .add("low_value") .add("high_value") - .add("histogram") .build(); } @@ -311,7 +310,6 @@ private Row createColumnStatsRow(String columnName, Type type, ColumnStatistics rowValues.add(NULL_DOUBLE); rowValues.add(toStringLiteral(type, columnStatistics.getRange().map(DoubleRange::getMin))); rowValues.add(toStringLiteral(type, columnStatistics.getRange().map(DoubleRange::getMax))); - rowValues.add(columnStatistics.getHistogram().map(Objects::toString).map(StringLiteral::new).orElse(NULL_VARCHAR)); return new Row(rowValues.build()); } @@ -325,7 +323,6 @@ private Expression createEmptyColumnStatsRow(String columnName) rowValues.add(NULL_DOUBLE); 
rowValues.add(NULL_VARCHAR); rowValues.add(NULL_VARCHAR); - rowValues.add(NULL_VARCHAR); return new Row(rowValues.build()); } @@ -339,7 +336,6 @@ private static Row createTableStatsRow(TableStatistics tableStatistics) rowValues.add(createEstimateRepresentation(tableStatistics.getRowCount())); rowValues.add(NULL_VARCHAR); rowValues.add(NULL_VARCHAR); - rowValues.add(NULL_VARCHAR); return new Row(rowValues.build()); } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/AbstractTestComparisonStatsCalculator.java b/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculator.java similarity index 99% rename from presto-main/src/test/java/com/facebook/presto/cost/AbstractTestComparisonStatsCalculator.java rename to presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculator.java index 66c2ceef405fd..310b35dcfc1e4 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/AbstractTestComparisonStatsCalculator.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculator.java @@ -37,7 +37,6 @@ import java.util.Optional; import java.util.function.Consumer; -import static com.facebook.presto.SystemSessionProperties.OPTIMIZER_USE_HISTOGRAMS; import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.sql.tree.ComparisonExpression.Operator.EQUAL; import static com.facebook.presto.sql.tree.ComparisonExpression.Operator.GREATER_THAN; @@ -52,8 +51,7 @@ import static java.lang.String.format; import static java.util.stream.Collectors.joining; -@Test -public abstract class AbstractTestComparisonStatsCalculator +public class TestComparisonStatsCalculator { private FilterStatsCalculator filterStatsCalculator; private Session session; @@ -70,17 +68,11 @@ public abstract class AbstractTestComparisonStatsCalculator private VariableStatsEstimate emptyRangeStats; private VariableStatsEstimate varcharStats; - public AbstractTestComparisonStatsCalculator(boolean 
withHistograms) - { - session = testSessionBuilder() - .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, Boolean.toString(withHistograms)) - .build(); - } - @BeforeClass public void setUp() throws Exception { + session = testSessionBuilder().build(); MetadataManager metadata = MetadataManager.createTestMetadataManager(); filterStatsCalculator = new FilterStatsCalculator(metadata, new ScalarStatsCalculator(metadata), new StatsNormalizer()); diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorHistograms.java b/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorHistograms.java deleted file mode 100644 index e46090a11d333..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorHistograms.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.facebook.presto.cost; - -public class TestComparisonStatsCalculatorHistograms - extends AbstractTestComparisonStatsCalculator -{ - public TestComparisonStatsCalculatorHistograms() - { - super(true); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorNoHistograms.java b/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorNoHistograms.java deleted file mode 100644 index 1fb353fd9fa21..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestComparisonStatsCalculatorNoHistograms.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.facebook.presto.cost; - -public class TestComparisonStatsCalculatorNoHistograms - extends AbstractTestComparisonStatsCalculator -{ - public TestComparisonStatsCalculatorNoHistograms() - { - super(false); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestConnectorFilterStatsCalculatorService.java b/presto-main/src/test/java/com/facebook/presto/cost/TestConnectorFilterStatsCalculatorService.java index 584f55140e3a7..5bc06c514ddbb 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestConnectorFilterStatsCalculatorService.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestConnectorFilterStatsCalculatorService.java @@ -98,20 +98,20 @@ public void testTableStatisticsAfterFilter() TableStatistics filteredToZeroStatistics = TableStatistics.builder() .setRowCount(Estimate.zero()) .setTotalSize(Estimate.zero()) - .setColumnStatistics(xColumn, new ColumnStatistics(Estimate.of(1.0), Estimate.zero(), Estimate.zero(), Optional.empty(), Optional.empty())) + .setColumnStatistics(xColumn, new ColumnStatistics(Estimate.of(1.0), Estimate.zero(), Estimate.zero(), Optional.empty())) .build(); assertPredicate("false", originalTableStatistics, filteredToZeroStatistics); TableStatistics filteredStatistics = TableStatistics.builder() .setRowCount(Estimate.of(37.5)) .setTotalSize(Estimate.of(300)) - .setColumnStatistics(xColumn, new ColumnStatistics(Estimate.zero(), Estimate.of(20), Estimate.unknown(), Optional.of(new DoubleRange(-10, 0)), Optional.empty())) + .setColumnStatistics(xColumn, new ColumnStatistics(Estimate.zero(), Estimate.of(20), Estimate.unknown(), Optional.of(new DoubleRange(-10, 0)))) .build(); assertPredicate("x < 0", originalTableStatistics, filteredStatistics); TableStatistics filteredStatisticsWithoutTotalSize = TableStatistics.builder() .setRowCount(Estimate.of(37.5)) - .setColumnStatistics(xColumn, new ColumnStatistics(Estimate.zero(), Estimate.of(20), Estimate.unknown(), Optional.of(new 
DoubleRange(-10, 0)), Optional.empty())) + .setColumnStatistics(xColumn, new ColumnStatistics(Estimate.zero(), Estimate.of(20), Estimate.unknown(), Optional.of(new DoubleRange(-10, 0)))) .build(); assertPredicate("x < 0", originalTableStatisticsWithoutTotalSize, filteredStatisticsWithoutTotalSize); } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java b/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java deleted file mode 100644 index 1cbddcc781c58..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; -import org.apache.commons.math3.distribution.NormalDistribution; -import org.apache.commons.math3.distribution.RealDistribution; -import org.apache.commons.math3.distribution.UniformRealDistribution; -import org.testng.annotations.Test; - -import java.util.List; -import java.util.stream.Collectors; - -import static org.testng.Assert.assertEquals; - -public class TestDisjointRangeDomainHistogram - extends TestHistogram -{ - /** - * A uniform base with 2 ranges that are fully within the range of the uniform histogram. - */ - @Test - public void testBasicDisjointRanges() - { - ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(0d, 25d))); - constrained = DisjointRangeDomainHistogram - .addDisjunction(constrained, StatisticRange.fromRange(Range.open(75d, 100d))); - assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), 87.5); - assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), 0.0); - assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 100); - assertEquals(constrained.inverseCumulativeProbability(0.5).getValue(), 25); - } - - /** - * A uniform base with a range that (1) doesn't have any overlap with the base distribution (2) - * has partial overlap (both ends of the base) and (3) complete overlap. 
- */ - @Test - public void testSingleDisjointRange() - { - ConnectorHistogram source = new UniformDistributionHistogram(0, 10); - - // no overlap, left bound - ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(-10d, -5d))); - for (int i = -11; i < 12; i++) { - assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); - } - assertEquals(constrained.inverseCumulativeProbability(0.0), Estimate.unknown()); - assertEquals(constrained.inverseCumulativeProbability(1.0), Estimate.unknown()); - - // partial overlap left bound - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-2d, 2d))); - assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(-1, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(1, false).getValue(), 0.5, 1E-8); - assertEquals(constrained.cumulativeProbability(1.5, false).getValue(), 0.75, 1E-8); - assertEquals(constrained.cumulativeProbability(2, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.cumulativeProbability(4, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), 0d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.5).getValue(), 1d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), 1.5d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 2d, 1E-8); - - //full overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(3d, 4d))); - assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 
1E-8); - assertEquals(constrained.cumulativeProbability(1, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(3, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(3.5, false).getValue(), 0.5, 1E-8); - assertEquals(constrained.cumulativeProbability(4, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.cumulativeProbability(4.5, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), 3d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.5).getValue(), 3.5d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), 3.75d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 4d, 1E-8); - - //right side overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(8d, 12d))); - assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(5, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(8, false).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(9, false).getValue(), 0.5, 1E-8); - assertEquals(constrained.cumulativeProbability(9.5, false).getValue(), 0.75, 1E-8); - assertEquals(constrained.cumulativeProbability(10, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.cumulativeProbability(11, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.cumulativeProbability(12, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.cumulativeProbability(13, false).getValue(), 1.0, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), 8d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.5).getValue(), 9d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), 9.5d, 1E-8); - 
assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 10d, 1E-8); - - // no overlap, right bound - constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(15d, 20d))); - for (int i = 15; i < 20; i++) { - assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); - } - assertEquals(constrained.inverseCumulativeProbability(0.0), Estimate.unknown()); - assertEquals(constrained.inverseCumulativeProbability(1.0), Estimate.unknown()); - } - - /** - * Tests that calculations across N > 1 disjunctions applied to the source histogram are - * calculated properly. - */ - @Test - public void testMultipleDisjunction() - { - StandardNormalHistogram source = new StandardNormalHistogram(); - RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = disjunction(source, Range.closed(-2d, -1d)); - constrained = disjunction(constrained, Range.closed(1d, 2d)); - double rangeLeftProb = dist.cumulativeProbability(-1) - dist.cumulativeProbability(-2); - double rangeRightProb = dist.cumulativeProbability(2) - dist.cumulativeProbability(1); - double sumRangeProb = rangeLeftProb + rangeRightProb; - assertEquals(constrained.cumulativeProbability(-2, true).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(-1.5, true).getValue(), (dist.cumulativeProbability(-1.5d) - dist.cumulativeProbability(-2)) / sumRangeProb, 1E-8); - assertEquals(constrained.cumulativeProbability(-1, true).getValue(), 0.5, 1E-8); - assertEquals(constrained.cumulativeProbability(1, true).getValue(), 0.5, 1E-8); - assertEquals(constrained.cumulativeProbability(1.5, true).getValue(), (rangeLeftProb / sumRangeProb) + ((dist.cumulativeProbability(1.5) - dist.cumulativeProbability(1.0)) / sumRangeProb)); - assertEquals(constrained.cumulativeProbability(2, true).getValue(), 1.0, 1E-8); - 
assertEquals(constrained.cumulativeProbability(3, true).getValue(), 1.0, 1E-8); - } - - /** - * Ensures assumptions made in tests for uniform distributions apply correctly for - * a non-uniform distribution. - */ - @Test - public void testNormalDistribution() - { - // standard normal - StandardNormalHistogram source = new StandardNormalHistogram(); - RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-1d, 1d))); - assertEquals(constrained.cumulativeProbability(-1.0, true).getValue(), 0.0, 1E-8); - assertEquals(constrained.cumulativeProbability(0.0, true).getValue(), 0.5, 1E-8); - assertEquals(constrained.cumulativeProbability(1.0, true).getValue(), 1.0, 1E-8); - double probability = (dist.cumulativeProbability(-0.5) - dist.cumulativeProbability(-1.0)) / (dist.cumulativeProbability(1.0) - dist.cumulativeProbability(-1)); - assertEquals(constrained.cumulativeProbability(-0.5, true).getValue(), probability, 1E-8); - assertEquals(constrained.cumulativeProbability(0.5, true).getValue(), probability + (1.0 - (2 * probability)), 1E-8); - - assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), -1.0d, 1E-8); - probability = dist.inverseCumulativeProbability(dist.cumulativeProbability(-1) + 0.25 * (dist.cumulativeProbability(1) - dist.cumulativeProbability(-1))); - assertEquals(constrained.inverseCumulativeProbability(0.25).getValue(), -0.44177054668d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.5).getValue(), 0.0d, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), -1 * probability, 1E-8); - assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 1.0d, 1E-8); - } - - /** - * Ensures disjunctions of ranges works properly - */ - @Test - public void testAddDisjunction() - { - ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = 
disjunction(source, Range.open(-1d, 2d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 2d)); - constrained = disjunction(constrained, Range.open(1d, 10d)); - assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - constrained = disjunction(constrained, Range.closedOpen(50d, 100d)); - assertEquals(ranges(constrained).size(), 2); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - assertEquals(ranges(constrained).get(1), Range.closedOpen(50d, 100d)); - } - - /** - * Ensures conjunctions of ranges works properly - */ - @Test - public void testAddConjunction() - { - ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = disjunction(source, Range.open(10d, 90d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.open(10d, 90d)); - constrained = conjunction(constrained, Range.atMost(50d)); - assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.openClosed(10d, 50d)); - constrained = conjunction(constrained, Range.atLeast(25d)); - assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closed(25d, 50d)); - } - - private static DisjointRangeDomainHistogram disjunction(ConnectorHistogram source, Range range) - { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addDisjunction(source, StatisticRange.fromRange(range)); - } - - private static DisjointRangeDomainHistogram conjunction(ConnectorHistogram source, Range range) - { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addConjunction(source, StatisticRange.fromRange(range)); - } - - private static List> ranges(DisjointRangeDomainHistogram hist) - { - return hist.getRanges().stream().map(StatisticRange::toRange).collect(Collectors.toList()); - } - - 
private static class StandardNormalHistogram - implements ConnectorHistogram - { - private final NormalDistribution distribution = new NormalDistribution(); - - public NormalDistribution getDistribution() - { - return distribution; - } - - @Override - public Estimate cumulativeProbability(double value, boolean inclusive) - { - return Estimate.of(distribution.cumulativeProbability(value)); - } - - @Override - public Estimate inverseCumulativeProbability(double percentile) - { - // assume lower/upper limit is 10, in order to not throw - // exception, even though technically the bounds are technically - // INF - if (percentile <= 0.0) { - return Estimate.of(-10); - } - if (percentile >= 1.0) { - return Estimate.of(10); - } - return Estimate.of(distribution.inverseCumulativeProbability(percentile)); - } - } - - @Override - ConnectorHistogram createHistogram() - { - RealDistribution distribution = getDistribution(); - return new DisjointRangeDomainHistogram( - new UniformDistributionHistogram( - distribution.getSupportLowerBound(), distribution.getSupportUpperBound())) - .addDisjunction(new StatisticRange(0.0, 100.0, 0.0)); - } - - @Override - double getDistinctValues() - { - return 100; - } - - @Override - RealDistribution getDistribution() - { - return new UniformRealDistribution(0.0, 100.0); - } - - /** - * Support depends on the underlying distribution. 
- */ - @Override - public void testInclusiveExclusive() - { - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/AbstractTestFilterStatsCalculator.java b/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculator.java similarity index 97% rename from presto-main/src/test/java/com/facebook/presto/cost/AbstractTestFilterStatsCalculator.java rename to presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculator.java index c8f919326f92d..50528418e55c8 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/AbstractTestFilterStatsCalculator.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculator.java @@ -27,7 +27,6 @@ import java.util.Optional; -import static com.facebook.presto.SystemSessionProperties.OPTIMIZER_USE_HISTOGRAMS; import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.sql.planner.iterative.rule.test.PlanBuilder.expression; import static com.facebook.presto.testing.TestingSession.testSessionBuilder; @@ -37,7 +36,7 @@ import static java.lang.String.format; import static org.testng.Assert.assertEquals; -public abstract class AbstractTestFilterStatsCalculator +public class TestFilterStatsCalculator { private static final VarcharType MEDIUM_VARCHAR_TYPE = VarcharType.createVarcharType(100); @@ -55,13 +54,6 @@ public abstract class AbstractTestFilterStatsCalculator private Session session; private TestingRowExpressionTranslator translator; - public AbstractTestFilterStatsCalculator(boolean withHistograms) - { - session = testSessionBuilder() - .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, Boolean.toString(withHistograms)) - .build(); - } - @BeforeClass public void setUp() throws Exception @@ -145,6 +137,7 @@ public void setUp() .add(new VariableReferenceExpression(Optional.empty(), "mediumVarchar", MEDIUM_VARCHAR_TYPE)) .build()); + session = testSessionBuilder().build(); MetadataManager metadata = 
MetadataManager.createTestMetadataManager(); statsCalculator = new FilterStatsCalculator(metadata, new ScalarStatsCalculator(metadata), new StatsNormalizer()); translator = new TestingRowExpressionTranslator(MetadataManager.createTestMetadataManager()); @@ -383,33 +376,6 @@ public void testIsNotNullFilter() .variableStats(new VariableReferenceExpression(Optional.empty(), "emptyRange", DOUBLE), VariableStatsAssertion::empty); } - @Test - public void testBetweenOperatorFilterLeftOpen() - { - // Left side open, cut on open side - assertExpression("leftOpen BETWEEN DOUBLE '-10' AND 10e0") - .outputRowsCount(180.0) - .variableStats(new VariableReferenceExpression(Optional.empty(), "leftOpen", DOUBLE), variableStats -> - variableStats.distinctValuesCount(10.0) - .lowValue(-10.0) - .highValue(10.0) - .nullsFraction(0.0)); - } - - @Test - public void testBetweenOperatorFilterRightOpen() - { - // Left side open, cut on open side - // Right side open, cut on open side - assertExpression("rightOpen BETWEEN DOUBLE '-10' AND 10e0") - .outputRowsCount(180.0) - .variableStats(new VariableReferenceExpression(Optional.empty(), "rightOpen", DOUBLE), variableStats -> - variableStats.distinctValuesCount(10.0) - .lowValue(-10.0) - .highValue(10.0) - .nullsFraction(0.0)); - } - @Test public void testBetweenOperatorFilter() { @@ -456,6 +422,24 @@ public void testBetweenOperatorFilter() .highValue(3.14) .nullsFraction(0.0)); + // Left side open, cut on open side + assertExpression("leftOpen BETWEEN DOUBLE '-10' AND 10e0") + .outputRowsCount(180.0) + .variableStats(new VariableReferenceExpression(Optional.empty(), "leftOpen", DOUBLE), variableStats -> + variableStats.distinctValuesCount(10.0) + .lowValue(-10.0) + .highValue(10.0) + .nullsFraction(0.0)); + + // Right side open, cut on open side + assertExpression("rightOpen BETWEEN DOUBLE '-10' AND 10e0") + .outputRowsCount(180.0) + .variableStats(new VariableReferenceExpression(Optional.empty(), "rightOpen", DOUBLE), variableStats -> + 
variableStats.distinctValuesCount(10.0) + .lowValue(-10.0) + .highValue(10.0) + .nullsFraction(0.0)); + // Filter all assertExpression("y BETWEEN 27.5e0 AND 107e0") .outputRowsCount(0.0) @@ -604,7 +588,7 @@ public void testInPredicateFilter() .nullsFraction(0.0)); } - protected PlanNodeStatsAssertion assertExpression(String expression) + private PlanNodeStatsAssertion assertExpression(String expression) { return assertExpression(expression(expression)); } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorHistograms.java b/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorHistograms.java deleted file mode 100644 index 2ec593f07c9b1..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorHistograms.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.facebook.presto.cost; - -import com.facebook.presto.spi.relation.VariableReferenceExpression; -import org.testng.annotations.Test; - -import java.util.Optional; - -import static com.facebook.presto.common.type.DoubleType.DOUBLE; - -public class TestFilterStatsCalculatorHistograms - extends AbstractTestFilterStatsCalculator -{ - public TestFilterStatsCalculatorHistograms() - { - super(true); - } - - /** - * We override this test because the original logic utilizes heuristics in cases where stats - * don't exist and infinite bound exists. 
We choose to use slightly different heuristics - * for these cases when histograms are enabled. - *
- * See {@link StatisticRange#overlapPercentWith(StatisticRange)} - */ - @Test - public void testBetweenOperatorFilterLeftOpen() - { - assertExpression("leftOpen BETWEEN DOUBLE '-10' AND 10e0") - .outputRowsCount(56.25) - .variableStats(new VariableReferenceExpression(Optional.empty(), "leftOpen", DOUBLE), variableStats -> - variableStats.distinctValuesCount(10.0) - .lowValue(-10.0) - .highValue(10.0) - .nullsFraction(0.0)); - } - - /** - * We override this test because the original logic utilizes heuristics in cases where stats - * don't exist and infinite bound exists. We choose to use slightly different heuristics - * for these cases when histograms are enabled. - *
- * See {@link StatisticRange#overlapPercentWith(StatisticRange)} - */ - @Test - public void testBetweenOperatorFilterRightOpen() - { - // Left side open, cut on open side - // Right side open, cut on open side - assertExpression("rightOpen BETWEEN DOUBLE '-10' AND 10e0") - .outputRowsCount(56.25) - .variableStats(new VariableReferenceExpression(Optional.empty(), "rightOpen", DOUBLE), variableStats -> - variableStats.distinctValuesCount(10.0) - .lowValue(-10.0) - .highValue(10.0) - .nullsFraction(0.0)); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorNoHistograms.java b/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorNoHistograms.java deleted file mode 100644 index f101def2d8ce2..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestFilterStatsCalculatorNoHistograms.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.facebook.presto.cost; - -public class TestFilterStatsCalculatorNoHistograms - extends AbstractTestFilterStatsCalculator -{ - public TestFilterStatsCalculatorNoHistograms() - { - super(false); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java b/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java deleted file mode 100644 index 26c68b7e5730e..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import org.apache.commons.math3.distribution.RealDistribution; -import org.testng.annotations.Test; - -import static java.lang.Double.NEGATIVE_INFINITY; -import static java.lang.Double.POSITIVE_INFINITY; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertThrows; -import static org.testng.Assert.assertTrue; - -public abstract class TestHistogram -{ - abstract ConnectorHistogram createHistogram(); - - abstract RealDistribution getDistribution(); - - abstract double getDistinctValues(); - - @Test - public void testInverseCumulativeProbability() - { - ConnectorHistogram hist = createHistogram(); - RealDistribution dist = getDistribution(); - assertThrows(IllegalArgumentException.class, () -> hist.inverseCumulativeProbability(Double.NaN)); - assertThrows(IllegalArgumentException.class, () -> hist.inverseCumulativeProbability(-1.0)); - assertThrows(IllegalArgumentException.class, () -> hist.inverseCumulativeProbability(2.0)); - assertEquals(hist.inverseCumulativeProbability(0.0).getValue(), dist.getSupportLowerBound(), .001); - assertEquals(hist.inverseCumulativeProbability(0.25).getValue(), dist.inverseCumulativeProbability(0.25), .001); - assertEquals(hist.inverseCumulativeProbability(0.5).getValue(), dist.getNumericalMean(), .001); - assertEquals(hist.inverseCumulativeProbability(1.0).getValue(), dist.getSupportUpperBound(), .001); - } - - @Test - public void testCumulativeProbability() - { - ConnectorHistogram hist = createHistogram(); - RealDistribution dist = getDistribution(); - - assertTrue(hist.cumulativeProbability(Double.NaN, true).isUnknown()); - assertEquals(hist.cumulativeProbability(NEGATIVE_INFINITY, true).getValue(), 0.0, .001); - assertEquals(hist.cumulativeProbability(NEGATIVE_INFINITY, false).getValue(), 0.0, .001); - assertEquals(hist.cumulativeProbability(POSITIVE_INFINITY, true).getValue(), 1.0, .001); - 
assertEquals(hist.cumulativeProbability(POSITIVE_INFINITY, false).getValue(), 1.0, .001); - - assertEquals(hist.cumulativeProbability(dist.getSupportLowerBound() - 1, true).getValue(), 0.0, .001); - assertEquals(hist.cumulativeProbability(dist.getSupportLowerBound(), true).getValue(), 0.0, .001); - assertEquals(hist.cumulativeProbability(dist.getSupportUpperBound() + 1, true).getValue(), 1.0, .001); - assertEquals(hist.cumulativeProbability(dist.getSupportUpperBound(), true).getValue(), 1.0, .001); - assertEquals(hist.cumulativeProbability(dist.getNumericalMean(), true).getValue(), 0.5, .001); - for (int i = 0; i < 10; i++) { - assertEquals(hist.cumulativeProbability(dist.inverseCumulativeProbability(0.1 * i), true).getValue(), dist.cumulativeProbability(dist.inverseCumulativeProbability(0.1 * i)), .001); - } - } - - @Test - public void testInclusiveExclusive() - { - double ndvs = getDistinctValues(); - ConnectorHistogram hist = createHistogram(); - // test maximums - assertEquals(hist.cumulativeProbability(hist.inverseCumulativeProbability(1.0).getValue(), false).getValue(), 1.0 - (1.0 / ndvs), .0001); - assertEquals(hist.cumulativeProbability(hist.inverseCumulativeProbability(1.0).getValue(), true).getValue(), 1.0, .0001); - - // test minimums - assertEquals(hist.cumulativeProbability(hist.inverseCumulativeProbability(0.0).getValue(), false).getValue(), 0.0, .0001); - assertEquals(hist.cumulativeProbability(hist.inverseCumulativeProbability(0.0).getValue(), true).getValue(), 0.0, .0001); - - // test non-max/min - double midPercent = hist.inverseCumulativeProbability(0.5).getValue(); - assertEquals(hist.cumulativeProbability(midPercent, true).getValue() - hist.cumulativeProbability(midPercent, false).getValue(), 1.0 / ndvs, .0001); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java b/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java deleted file mode 100644 index 
ddccfdfe3c065..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import org.testng.annotations.Test; - -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static java.lang.Double.NEGATIVE_INFINITY; -import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; -import static org.testng.Assert.assertEquals; - -public class TestHistogramCalculator -{ - @Test - public void testCalculateFilterFactor() - { - StatisticRange zeroToTen = range(0, 10, 10); - StatisticRange empty = StatisticRange.empty(); - - // Equal ranges - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 5); - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 20); - - // Some overlap - assertFilterFactor(Estimate.of(0.5), range(5, 3000, 5), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Single value overlap - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(3, 3, 1), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(10, 100, 357), uniformHist(zeroToTen), 
zeroToTen.getDistinctValuesCount()); - - // No overlap - assertFilterFactor(Estimate.zero(), range(20, 30, 10), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Empty ranges - assertFilterFactor(Estimate.zero(), zeroToTen, uniformHist(empty), empty.getDistinctValuesCount()); - assertFilterFactor(Estimate.zero(), empty, uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // no test for (empty, empty) since any return value is correct - assertFilterFactor(Estimate.zero(), unboundedRange(10), uniformHist(empty), empty.getDistinctValuesCount()); - assertFilterFactor(Estimate.zero(), empty, uniformHist(unboundedRange(10)), 10); - - // Unbounded (infinite), NDV-based - assertFilterFactor(Estimate.of(0.5), unboundedRange(10), uniformHist(unboundedRange(20)), 20); - assertFilterFactor(Estimate.of(1.0), unboundedRange(20), uniformHist(unboundedRange(10)), 10); - - // NEW TESTS (TPC-H Q2) - // unbounded ranges - assertFilterFactor(Estimate.of(.5), unboundedRange(0.5), uniformHist(unboundedRange(NaN)), NaN); - // unbounded ranges with limited distinct values - assertFilterFactor(Estimate.of(0.2), unboundedRange(1.0), - domainConstrained(unboundedRange(5.0), uniformHist(unboundedRange(7.0))), 5.0); - } - - private static StatisticRange range(double low, double high, double distinctValues) - { - return new StatisticRange(low, high, distinctValues); - } - - private static StatisticRange unboundedRange(double distinctValues) - { - return new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, distinctValues); - } - - private static void assertFilterFactor(Estimate expected, StatisticRange range, ConnectorHistogram histogram, double totalDistinctValues) - { - assertEquals( - calculateFilterFactor(range, histogram, Estimate.estimateFromDouble(totalDistinctValues), true), - expected); - } - - private static ConnectorHistogram uniformHist(StatisticRange range) - { - return uniformHist(range.getLow(), range.getHigh()); - } - - private static 
ConnectorHistogram uniformHist(double low, double high) - { - return new UniformDistributionHistogram(low, high); - } - - private static ConnectorHistogram domainConstrained(StatisticRange range, ConnectorHistogram source) - { - return DisjointRangeDomainHistogram.addDisjunction(source, range); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java index f259e3da95893..ba67a40f3054d 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java @@ -14,11 +14,9 @@ package com.facebook.presto.cost; import com.facebook.presto.spi.relation.VariableReferenceExpression; -import com.facebook.presto.spi.statistics.ConnectorHistogram; import org.testng.annotations.Test; import java.util.Optional; -import java.util.function.BiFunction; import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.cost.PlanNodeStatsEstimateMath.addStatsAndMaxDistinctValues; @@ -358,62 +356,6 @@ private static void assertCapNullsFraction(PlanNodeStatsEstimate stats, PlanNode assertEquals(capStats(stats, cap).getVariableStatistics(VARIABLE).getNullsFraction(), expected); } - @Test - public void testAddHistograms() - { - StatisticRange zeroToTen = new StatisticRange(0, 10, 1); - StatisticRange zeroToFive = new StatisticRange(0, 5, 1); - StatisticRange fiveToTen = new StatisticRange(5, 10, 1); - StatisticRange threeToSeven = new StatisticRange(3, 7, 1); - - PlanNodeStatsEstimate unknownRowCount = statistics(NaN, NaN, NaN, NaN, zeroToTen); - PlanNodeStatsEstimate unknownNullsFraction = statistics(10, NaN, NaN, NaN, zeroToTen); - PlanNodeStatsEstimate first = statistics(50, NaN, 0.25, NaN, zeroToTen); - PlanNodeStatsEstimate second = statistics(25, NaN, 0.6, NaN, zeroToFive); - PlanNodeStatsEstimate 
third = statistics(25, NaN, 0.6, NaN, fiveToTen); - PlanNodeStatsEstimate fourth = statistics(20, NaN, 0.6, NaN, threeToSeven); - ConnectorHistogram nanHistogram = VariableStatsEstimate.unknown().getHistogram(); - - // histogram should be full open range when row counts are unknown - assertAddStatsHistogram(unknownRowCount, unknownRowCount, PlanNodeStatsEstimateMath::addStatsAndCollapseDistinctValues, nanHistogram); - - // check when rows are available histograms are added properly. - ConnectorHistogram addedSameRange = DisjointRangeDomainHistogram.addDisjunction(unknownNullsFraction.getVariableStatistics(VARIABLE).getHistogram(), zeroToTen); - assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, PlanNodeStatsEstimateMath::addStatsAndSumDistinctValues, addedSameRange); - assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, PlanNodeStatsEstimateMath::addStatsAndCollapseDistinctValues, addedSameRange); - assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, PlanNodeStatsEstimateMath::addStatsAndMaxDistinctValues, addedSameRange); - assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, PlanNodeStatsEstimateMath::addStatsAndIntersect, addedSameRange); - - // check when only a sub-range is added, that the histogram still represents the full range - ConnectorHistogram fullRangeFirst = DisjointRangeDomainHistogram.addDisjunction(first.getVariableStatistics(VARIABLE).getHistogram(), zeroToTen); - ConnectorHistogram intersectedRangeSecond = DisjointRangeDomainHistogram.addConjunction(first.getVariableStatistics(VARIABLE).getHistogram(), zeroToFive); - assertAddStatsHistogram(first, second, PlanNodeStatsEstimateMath::addStatsAndSumDistinctValues, fullRangeFirst); - assertAddStatsHistogram(first, second, PlanNodeStatsEstimateMath::addStatsAndCollapseDistinctValues, fullRangeFirst); - assertAddStatsHistogram(first, second, PlanNodeStatsEstimateMath::addStatsAndMaxDistinctValues, fullRangeFirst); - 
assertAddStatsHistogram(first, second, PlanNodeStatsEstimateMath::addStatsAndIntersect, intersectedRangeSecond); - - // check when two ranges overlap, the new stats span both ranges - ConnectorHistogram fullRangeSecondThird = DisjointRangeDomainHistogram.addDisjunction(second.getVariableStatistics(VARIABLE).getHistogram(), fiveToTen); - ConnectorHistogram intersectedRangeSecondThird = DisjointRangeDomainHistogram.addConjunction(second.getVariableStatistics(VARIABLE).getHistogram(), fiveToTen); - assertAddStatsHistogram(second, third, PlanNodeStatsEstimateMath::addStatsAndSumDistinctValues, fullRangeSecondThird); - assertAddStatsHistogram(second, third, PlanNodeStatsEstimateMath::addStatsAndCollapseDistinctValues, fullRangeSecondThird); - assertAddStatsHistogram(second, third, PlanNodeStatsEstimateMath::addStatsAndMaxDistinctValues, fullRangeSecondThird); - assertAddStatsHistogram(second, third, PlanNodeStatsEstimateMath::addStatsAndIntersect, intersectedRangeSecondThird); - - // check when two ranges partially overlap, the addition/intersection is applied correctly - ConnectorHistogram fullRangeThirdFourth = DisjointRangeDomainHistogram.addDisjunction(third.getVariableStatistics(VARIABLE).getHistogram(), threeToSeven); - ConnectorHistogram intersectedRangeThirdFourth = DisjointRangeDomainHistogram.addConjunction(third.getVariableStatistics(VARIABLE).getHistogram(), threeToSeven); - assertAddStatsHistogram(third, fourth, PlanNodeStatsEstimateMath::addStatsAndSumDistinctValues, fullRangeThirdFourth); - assertAddStatsHistogram(third, fourth, PlanNodeStatsEstimateMath::addStatsAndCollapseDistinctValues, fullRangeThirdFourth); - assertAddStatsHistogram(third, fourth, PlanNodeStatsEstimateMath::addStatsAndMaxDistinctValues, fullRangeThirdFourth); - assertAddStatsHistogram(third, fourth, PlanNodeStatsEstimateMath::addStatsAndIntersect, intersectedRangeThirdFourth); - } - - private static void assertAddStatsHistogram(PlanNodeStatsEstimate first, PlanNodeStatsEstimate 
second, BiFunction function, ConnectorHistogram expected) - { - assertEquals(function.apply(first, second).getVariableStatistics(VARIABLE).getHistogram(), expected); - } - private static PlanNodeStatsEstimate statistics(double rowCount, double totalSize, double nullsFraction, double averageRowSize, StatisticRange range) { return PlanNodeStatsEstimate.builder() @@ -423,7 +365,6 @@ private static PlanNodeStatsEstimate statistics(double rowCount, double totalSiz .setNullsFraction(nullsFraction) .setAverageRowSize(averageRowSize) .setStatisticsRange(range) - .setHistogram(DisjointRangeDomainHistogram.addConjunction(new UniformDistributionHistogram(range.getLow(), range.getHigh()), range)) .build()) .build(); } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestStatisticRange.java b/presto-main/src/test/java/com/facebook/presto/cost/TestStatisticRange.java index 5975237dde8d0..d88780ecf8ccc 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestStatisticRange.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestStatisticRange.java @@ -20,8 +20,6 @@ import static java.lang.Double.NaN; import static java.lang.Double.POSITIVE_INFINITY; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; public class TestStatisticRange { @@ -106,66 +104,11 @@ public void testAddAndCollapseDistinctValues() assertEquals(range(0, 3, 3).addAndCollapseDistinctValues(range(2, 6, 4)), range(0, 6, 6)); } - @Test - public void testIntersectOpenness() - { - StatisticRange first = range(0, true, 10, true, 10); - StatisticRange second = range(0, true, 5, true, 5); - StatisticRange intersect = first.intersect(second); - assertTrue(intersect.getOpenLow()); - assertTrue(intersect.getOpenHigh()); - intersect = second.intersect(first); - assertTrue(intersect.getOpenLow()); - assertTrue(intersect.getOpenHigh()); - - // second range bounds only on high - second = range(-1, true, 5, 
false, 5); - intersect = first.intersect(second); - assertTrue(intersect.getOpenLow()); - assertFalse(intersect.getOpenHigh()); - intersect = second.intersect(first); - assertTrue(intersect.getOpenLow()); - assertFalse(intersect.getOpenHigh()); - - // second range bounds on low and high - second = range(1, false, 5, false, 5); - intersect = first.intersect(second); - assertFalse(intersect.getOpenLow()); - assertFalse(intersect.getOpenHigh()); - intersect = second.intersect(first); - assertFalse(intersect.getOpenLow()); - assertFalse(intersect.getOpenHigh()); - - // second range bounds only on low - second = range(1, false, 5, true, 5); - intersect = first.intersect(second); - assertFalse(intersect.getOpenLow()); - assertTrue(intersect.getOpenHigh()); - intersect = second.intersect(first); - assertFalse(intersect.getOpenLow()); - assertTrue(intersect.getOpenHigh()); - - // same bounds but one is open and one is closed - first = range(0, false, 5, false, 5); - second = range(0, true, 5, true, 5); - intersect = first.intersect(second); - assertTrue(intersect.getOpenLow()); - assertTrue(intersect.getOpenHigh()); - intersect = second.intersect(first); - assertTrue(intersect.getOpenLow()); - assertTrue(intersect.getOpenHigh()); - } - private static StatisticRange range(double low, double high, double distinctValues) { return new StatisticRange(low, high, distinctValues); } - private static StatisticRange range(double low, boolean openLow, double high, boolean openHigh, double distinctValues) - { - return new StatisticRange(low, openLow, high, openHigh, distinctValues); - } - private static StatisticRange unboundedRange(double distinctValues) { return new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, distinctValues); diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java b/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java deleted file mode 100644 index 395bc3f6e7518..0000000000000 --- 
a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.base.VerifyException; -import org.apache.commons.math3.distribution.RealDistribution; -import org.apache.commons.math3.distribution.UniformRealDistribution; -import org.testng.annotations.Test; - -import static java.lang.Double.POSITIVE_INFINITY; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertThrows; -import static org.testng.Assert.assertTrue; - -public class TestUniformHistogram - extends TestHistogram -{ - ConnectorHistogram createHistogram() - { - return new UniformDistributionHistogram(0, 1); - } - - RealDistribution getDistribution() - { - return new UniformRealDistribution(); - } - - @Override - double getDistinctValues() - { - return 100; - } - - @Test - public void testInvalidConstruction() - { - assertThrows(VerifyException.class, () -> new UniformDistributionHistogram(2.0, 1.0)); - } - - @Test - public void testNanRangeValues() - { - ConnectorHistogram hist = new UniformDistributionHistogram(Double.NaN, 2); - assertTrue(hist.inverseCumulativeProbability(0.5).isUnknown()); - - hist = new UniformDistributionHistogram(1.0, Double.NaN); - 
assertTrue(hist.inverseCumulativeProbability(0.5).isUnknown()); - - hist = new UniformDistributionHistogram(1.0, 2.0); - assertEquals(hist.inverseCumulativeProbability(0.5).getValue(), 1.5); - } - - @Test - public void testInfiniteRangeValues() - { - // test low value as infinite - ConnectorHistogram hist = new UniformDistributionHistogram(Double.NEGATIVE_INFINITY, 2); - - assertTrue(hist.inverseCumulativeProbability(0.5).isUnknown()); - assertEquals(hist.inverseCumulativeProbability(0.0), Estimate.unknown()); - assertEquals(hist.inverseCumulativeProbability(1.0).getValue(), 2.0); - - assertEquals(hist.cumulativeProbability(0.0, true), Estimate.unknown()); - assertEquals(hist.cumulativeProbability(1.0, true), Estimate.unknown()); - assertEquals(hist.cumulativeProbability(2.0, true).getValue(), 1.0); - assertEquals(hist.cumulativeProbability(2.5, true).getValue(), 1.0); - - // test high value as infinite - hist = new UniformDistributionHistogram(1.0, POSITIVE_INFINITY); - - assertTrue(hist.inverseCumulativeProbability(0.5).isUnknown()); - assertEquals(hist.inverseCumulativeProbability(0.0).getValue(), 1.0); - assertEquals(hist.inverseCumulativeProbability(1.0), Estimate.unknown()); - - assertEquals(hist.cumulativeProbability(0.0, true).getValue(), 0.0); - assertEquals(hist.cumulativeProbability(1.0, true).getValue(), 0.0); - assertEquals(hist.cumulativeProbability(1.5, true), Estimate.unknown()); - } - - @Test - public void testSingleValueRange() - { - UniformDistributionHistogram hist = new UniformDistributionHistogram(1.0, 1.0); - - assertEquals(hist.inverseCumulativeProbability(0.0).getValue(), 1.0); - assertEquals(hist.inverseCumulativeProbability(1.0).getValue(), 1.0); - assertEquals(hist.inverseCumulativeProbability(0.5).getValue(), 1.0); - - assertEquals(hist.cumulativeProbability(0.0, true).getValue(), 0.0); - assertEquals(hist.cumulativeProbability(0.5, true).getValue(), 0.0); - assertEquals(hist.cumulativeProbability(1.0, true).getValue(), 1.0); - 
assertEquals(hist.cumulativeProbability(1.5, true).getValue(), 1.0); - } - - /** - * {@link UniformDistributionHistogram} does not support the inclusive/exclusive arguments - */ - @Override - public void testInclusiveExclusive() - { - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/operator/scalar/queryplan/TestJsonPrestoQueryPlanFunctionUtils.java b/presto-main/src/test/java/com/facebook/presto/operator/scalar/queryplan/TestJsonPrestoQueryPlanFunctionUtils.java index 603e8353ccec5..0c77a04d046e0 100644 --- a/presto-main/src/test/java/com/facebook/presto/operator/scalar/queryplan/TestJsonPrestoQueryPlanFunctionUtils.java +++ b/presto-main/src/test/java/com/facebook/presto/operator/scalar/queryplan/TestJsonPrestoQueryPlanFunctionUtils.java @@ -46,36 +46,21 @@ private TestJsonPrestoQueryPlanFunctionUtils() {} " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b_1\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + 
" \"distinctValuesCount\" : 0.0\n" + " }\n" + " },\n" + " \"joinNodeStatsEstimate\" : {\n" + @@ -133,36 +118,21 @@ private TestJsonPrestoQueryPlanFunctionUtils() {} " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"a_0\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b_1\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " }\n" + " },\n" + " \"joinNodeStatsEstimate\" : {\n" + @@ -191,36 +161,21 @@ private TestJsonPrestoQueryPlanFunctionUtils() {} " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" 
: 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b_1\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " }\n" + " },\n" + " \"joinNodeStatsEstimate\" : {\n" + @@ -257,24 +212,14 @@ private TestJsonPrestoQueryPlanFunctionUtils() {} " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " }\n" + " },\n" + " \"joinNodeStatsEstimate\" : {\n" + @@ -300,36 +245,21 @@ private TestJsonPrestoQueryPlanFunctionUtils() {} " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : 
\"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"a\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " }\n" + " },\n" + " \"joinNodeStatsEstimate\" : {\n" + @@ -366,24 +296,14 @@ private TestJsonPrestoQueryPlanFunctionUtils() {} " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b_1\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " }\n" + " },\n" + " \"joinNodeStatsEstimate\" : {\n" + @@ -409,36 +329,21 @@ private TestJsonPrestoQueryPlanFunctionUtils() {} " \"highValue\" : 
\"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"a_0\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " },\n" + " \"b_1\" : {\n" + " \"lowValue\" : \"NaN\",\n" + " \"highValue\" : \"NaN\",\n" + " \"nullsFraction\" : 1.0,\n" + " \"averageRowSize\" : 0.0,\n" + - " \"distinctValuesCount\" : 0.0,\n" + - " \"histogram\" : {\n" + - " \"@class\" : \"com.facebook.presto.cost.UniformDistributionHistogram\",\n" + - " \"lowValue\" : \"NaN\",\n" + - " \"highValue\" : \"NaN\"\n" + - " }\n" + + " \"distinctValuesCount\" : 0.0\n" + " }\n" + " },\n" + " \"joinNodeStatsEstimate\" : {\n" + diff --git a/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestFeaturesConfig.java b/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestFeaturesConfig.java index 2ab3965ccdf43..ce9d8a8673b5e 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestFeaturesConfig.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/analyzer/TestFeaturesConfig.java @@ -270,7 +270,6 @@ public void testDefaults() .setDefaultWriterReplicationCoefficient(3.0) .setDefaultViewSecurityMode(DEFINER) .setCteHeuristicReplicationThreshold(4) - .setUseHistograms(false) .setLegacyJsonCast(true)); } @@ -486,7 +485,6 @@ public void testExplicitPropertyMappings() .put("optimizer.default-writer-replication-coefficient", "5.0") 
.put("default-view-security-mode", INVOKER.name()) .put("cte-heuristic-replication-threshold", "2") - .put("optimizer.use-histograms", "true") .build(); FeaturesConfig expected = new FeaturesConfig() @@ -698,7 +696,6 @@ public void testExplicitPropertyMappings() .setDefaultWriterReplicationCoefficient(5.0) .setDefaultViewSecurityMode(INVOKER) .setCteHeuristicReplicationThreshold(2) - .setUseHistograms(true) .setLegacyJsonCast(false); assertFullMapping(properties, expected); } diff --git a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/BasePlanTest.java b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/BasePlanTest.java index 2d82e49863fba..df7941cf63f89 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/BasePlanTest.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/BasePlanTest.java @@ -280,14 +280,9 @@ protected Plan plan(String sql, Optimizer.PlanStage stage) } protected Plan plan(String sql, Optimizer.PlanStage stage, boolean forceSingleNode) - { - return plan(queryRunner.getDefaultSession(), sql, stage, forceSingleNode); - } - - protected Plan plan(Session session, String sql, Optimizer.PlanStage stage, boolean forceSingleNode) { try { - return queryRunner.inTransaction(session, transactionSession -> queryRunner.createPlan(transactionSession, sql, stage, forceSingleNode, WarningCollector.NOOP)); + return queryRunner.inTransaction(transactionSession -> queryRunner.createPlan(transactionSession, sql, stage, forceSingleNode, WarningCollector.NOOP)); } catch (RuntimeException e) { throw new AssertionError("Planning failed for SQL: " + sql, e); diff --git a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java index bd4f8afe34ae3..9d550991da7ba 100644 --- 
a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java +++ b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java @@ -258,10 +258,10 @@ public void testAnalyzeStats() // column_name | data_size | distinct_values_count | nulls_fraction | row_count | low_value | high_value assertQuery("SHOW STATS FOR region", "SELECT * FROM (VALUES" + - "('regionkey', NULL, 5.0, 0.0, NULL, '0', '4', NULL)," + - "('name', 54.0, 5.0, 0.0, NULL, NULL, NULL, NULL)," + - "('comment', 350.0, 5.0, 0.0, NULL, NULL, NULL, NULL)," + - "(NULL, NULL, NULL, NULL, 5.0, NULL, NULL, NULL))"); + "('regionkey', NULL, 5.0, 0.0, NULL, '0', '4')," + + "('name', 54.0, 5.0, 0.0, NULL, NULL, NULL)," + + "('comment', 350.0, 5.0, 0.0, NULL, NULL, NULL)," + + "(NULL, NULL, NULL, NULL, 5.0, NULL, NULL))"); // Create a partitioned table and run analyze on it. String tmpTableName = generateRandomTableName(); @@ -275,17 +275,17 @@ public void testAnalyzeStats() assertUpdate(String.format("ANALYZE %s", tmpTableName), 25); assertQuery(String.format("SHOW STATS for %s", tmpTableName), "SELECT * FROM (VALUES" + - "('name', 277.0, 1.0, 0.0, NULL, NULL, NULL, NULL)," + - "('regionkey', NULL, 5.0, 0.0, NULL, '0', '4', NULL)," + - "('nationkey', NULL, 25.0, 0.0, NULL, '0', '24', NULL)," + - "(NULL, NULL, NULL, NULL, 25.0, NULL, NULL, NULL))"); + "('name', 277.0, 1.0, 0.0, NULL, NULL, NULL)," + + "('regionkey', NULL, 5.0, 0.0, NULL, '0', '4')," + + "('nationkey', NULL, 25.0, 0.0, NULL, '0', '24')," + + "(NULL, NULL, NULL, NULL, 25.0, NULL, NULL))"); assertUpdate(String.format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['0','0'],ARRAY['4', '11']])", tmpTableName), 2); assertQuery(String.format("SHOW STATS for (SELECT * FROM %s where regionkey=4 and nationkey=11)", tmpTableName), "SELECT * FROM (VALUES" + - "('name', 8.0, 1.0, 0.0, NULL, NULL, NULL, NULL)," + - "('regionkey', NULL, 1.0, 0.0, NULL, '4', '4', NULL)," 
+ - "('nationkey', NULL, 1.0, 0.0, NULL, '11', '11', NULL)," + - "(NULL, NULL, NULL, NULL, 1.0, NULL, NULL, NULL))"); + "('name', 8.0, 1.0, 0.0, NULL, NULL, NULL)," + + "('regionkey', NULL, 1.0, 0.0, NULL, '4', '4')," + + "('nationkey', NULL, 1.0, 0.0, NULL, '11', '11')," + + "(NULL, NULL, NULL, NULL, 1.0, NULL, NULL))"); } finally { dropTableIfExists(tmpTableName); @@ -305,9 +305,9 @@ public void testAnalyzeStatsOnDecimals() assertUpdate(String.format("ANALYZE %s", tmpTableName), 7); assertQuery(String.format("SHOW STATS for %s", tmpTableName), "SELECT * FROM (VALUES" + - "('c0', NULL,4.0 , 0.2857142857142857, NULL, '-542392.89', '1000000.12', NULL)," + - "('c1', NULL,4.0 , 0.2857142857142857, NULL, '-6.72398239210929E12', '2.823982323232357E13', NULL)," + - "(NULL, NULL, NULL, NULL, 7.0, NULL, NULL, NULL))"); + "('c0', NULL,4.0 , 0.2857142857142857, NULL, '-542392.89', '1000000.12')," + + "('c1', NULL,4.0 , 0.2857142857142857, NULL, '-6.72398239210929E12', '2.823982323232357E13')," + + "(NULL, NULL, NULL, NULL, 7.0, NULL, NULL))"); } finally { dropTableIfExists(tmpTableName); diff --git a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java index eb447fdab8270..441297f4ddaf2 100644 --- a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java +++ b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java @@ -279,36 +279,36 @@ public void testCollectColumnStatisticsOnCreateTable() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tmpTableName), "SELECT * FROM (VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, 
null), " + - "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8.0 - "('c_array', 184.0E0, null, 0.5, null, null, null, null), " + // 176 - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null), " + // 8.0 + "('c_array', 184.0E0, null, 0.5, null, null, null), " + // 176 + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tmpTableName), "SELECT * FROM (VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8 - "('c_array', 104.0E0, null, 0.5, null, null, null, null), " + // 96 - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null), " + // 8 + 
"('c_array', 104.0E0, null, 0.5, null, null, null), " + // 96 + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)"); // non existing partition assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tmpTableName), "SELECT * FROM (VALUES " + - "('c_boolean', null, 0E0, 0E0, null, null, null, null), " + - "('c_bigint', null, 0E0, 0E0, null, null, null, null), " + - "('c_double', null, 0E0, 0E0, null, null, null, null), " + - "('c_timestamp', null, 0E0, 0E0, null, null, null, null), " + - "('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "('c_array', null, 0E0, 0E0, null, null, null, null), " + - "('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "(null, null, null, null, 0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)"); + "('c_boolean', null, 0E0, 0E0, null, null, null), " + + "('c_bigint', null, 0E0, 0E0, null, null, null), " + + "('c_double', null, 0E0, 0E0, null, null, null), " + + "('c_timestamp', null, 0E0, 0E0, null, null, null), " + + "('c_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "('c_array', null, 0E0, 0E0, null, null, null), " + + "('p_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "(null, null, null, null, 0E0, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)"); dropTableIfExists(tmpTableName); } @@ -349,36 +349,36 @@ public void testCollectColumnStatisticsOnInsert() assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tmpTableName), "SELECT * FROM (VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + - 
"('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8 - "('c_array', 184.0E0, null, 0.5E0, null, null, null, null), " + // 176 - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null), " + // 8 + "('c_array', 184.0E0, null, 0.5E0, null, null, null), " + // 176 + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tmpTableName), "SELECT * FROM (VALUES " + - "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2', null), " + - "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " + - "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + - "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8 - "('c_array', 104.0E0, null, 0.5, null, null, null, null), " + // 96 - "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)"); + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null), " + // 8 + "('c_array', 
104.0E0, null, 0.5, null, null, null), " + // 96 + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)"); // non existing partition assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tmpTableName), "SELECT * FROM (VALUES " + - "('c_boolean', null, 0E0, 0E0, null, null, null, null), " + - "('c_bigint', null, 0E0, 0E0, null, null, null, null), " + - "('c_double', null, 0E0, 0E0, null, null, null, null), " + - "('c_timestamp', null, 0E0, 0E0, null, null, null, null), " + - "('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "('c_array', null, 0E0, 0E0, null, null, null, null), " + - "('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "(null, null, null, null, 0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)"); + "('c_boolean', null, 0E0, 0E0, null, null, null), " + + "('c_bigint', null, 0E0, 0E0, null, null, null), " + + "('c_double', null, 0E0, 0E0, null, null, null), " + + "('c_timestamp', null, 0E0, 0E0, null, null, null), " + + "('c_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "('c_array', null, 0E0, 0E0, null, null, null), " + + "('p_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "(null, null, null, null, 0E0, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)"); dropTableIfExists(tmpTableName); } diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java index 5a880eb631522..aac3355d38f0d 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java @@ -55,19 +55,19 @@ public void 
testShowStatisticsForExternalTable() onHive().executeQuery("ANALYZE TABLE " + EXTERNAL_TABLE_NAME + " PARTITION (p_regionkey) COMPUTE STATISTICS"); assertThat(query("SHOW STATS FOR " + EXTERNAL_TABLE_NAME)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row(null, null, null, null, 5.0, null, null)); onHive().executeQuery("ANALYZE TABLE " + EXTERNAL_TABLE_NAME + " PARTITION (p_regionkey) COMPUTE STATISTICS FOR COLUMNS"); assertThat(query("SHOW STATS FOR " + EXTERNAL_TABLE_NAME)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row(null, null, null, null, 5.0, null, null)); } @Test diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java index 59071737e25cc..7562179670e7d 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java +++ 
b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java @@ -102,58 +102,58 @@ public Requirement getRequirements(Configuration configuration) .build(); private static final List ALL_TYPES_TABLE_STATISTICS = ImmutableList.of( - row("c_tinyint", null, 2.0, 0.0, null, "121", "127", null), - row("c_smallint", null, 2.0, 0.0, null, "32761", "32767", null), - row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647", null), - row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807", null), - row("c_float", null, 2.0, 0.0, null, "123.341", "123.345", null), - row("c_double", null, 2.0, 0.0, null, "234.561", "235.567", null), - row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0", null), - row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678", null), - row("c_timestamp", null, 2.0, 0.0, null, null, null, null), - row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10", null), - row("c_string", 22.0, 2.0, 0.0, null, null, null, null), - row("c_varchar", 20.0, 2.0, 0.0, null, null, null, null), - row("c_char", 12.0, 2.0, 0.0, null, null, null, null), - row("c_boolean", null, 2.0, 0.0, null, null, null, null), - row("c_binary", 23.0, null, 0.0, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_tinyint", null, 2.0, 0.0, null, "121", "127"), + row("c_smallint", null, 2.0, 0.0, null, "32761", "32767"), + row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647"), + row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 2.0, 0.0, null, "123.341", "123.345"), + row("c_double", null, 2.0, 0.0, null, "234.561", "235.567"), + row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678"), + row("c_timestamp", null, 2.0, 0.0, null, null, null), + row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), + row("c_string", 
22.0, 2.0, 0.0, null, null, null), + row("c_varchar", 20.0, 2.0, 0.0, null, null, null), + row("c_char", 12.0, 2.0, 0.0, null, null, null), + row("c_boolean", null, 2.0, 0.0, null, null, null), + row("c_binary", 23.0, null, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null)); private static final List ALL_TYPES_ALL_NULL_TABLE_STATISTICS = ImmutableList.of( - row("c_tinyint", null, 0.0, 1.0, null, null, null, null), - row("c_smallint", null, 0.0, 1.0, null, null, null, null), - row("c_int", null, 0.0, 1.0, null, null, null, null), - row("c_bigint", null, 0.0, 1.0, null, null, null, null), - row("c_float", null, 0.0, 1.0, null, null, null, null), - row("c_double", null, 0.0, 1.0, null, null, null, null), - row("c_decimal", null, 0.0, 1.0, null, null, null, null), - row("c_decimal_w_params", null, 0.0, 1.0, null, null, null, null), - row("c_timestamp", null, 0.0, 1.0, null, null, null, null), - row("c_date", null, 0.0, 1.0, null, null, null, null), - row("c_string", 0.0, 0.0, 1.0, null, null, null, null), - row("c_varchar", 0.0, 0.0, 1.0, null, null, null, null), - row("c_char", 0.0, 0.0, 1.0, null, null, null, null), - row("c_boolean", null, 0.0, 1.0, null, null, null, null), - row("c_binary", 0.0, null, 1.0, null, null, null, null), - row(null, null, null, null, 1.0, null, null, null)); + row("c_tinyint", null, 0.0, 1.0, null, null, null), + row("c_smallint", null, 0.0, 1.0, null, null, null), + row("c_int", null, 0.0, 1.0, null, null, null), + row("c_bigint", null, 0.0, 1.0, null, null, null), + row("c_float", null, 0.0, 1.0, null, null, null), + row("c_double", null, 0.0, 1.0, null, null, null), + row("c_decimal", null, 0.0, 1.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 1.0, null, null, null), + row("c_timestamp", null, 0.0, 1.0, null, null, null), + row("c_date", null, 0.0, 1.0, null, null, null), + row("c_string", 0.0, 0.0, 1.0, null, null, null), + row("c_varchar", 0.0, 0.0, 1.0, null, null, null), + row("c_char", 0.0, 0.0, 
1.0, null, null, null), + row("c_boolean", null, 0.0, 1.0, null, null, null), + row("c_binary", 0.0, null, 1.0, null, null, null), + row(null, null, null, null, 1.0, null, null)); private static final List ALL_TYPES_EMPTY_TABLE_STATISTICS = ImmutableList.of( - row("c_tinyint", null, 0.0, 0.0, null, null, null, null), - row("c_smallint", null, 0.0, 0.0, null, null, null, null), - row("c_int", null, 0.0, 0.0, null, null, null, null), - row("c_bigint", null, 0.0, 0.0, null, null, null, null), - row("c_float", null, 0.0, 0.0, null, null, null, null), - row("c_double", null, 0.0, 0.0, null, null, null, null), - row("c_decimal", null, 0.0, 0.0, null, null, null, null), - row("c_decimal_w_params", null, 0.0, 0.0, null, null, null, null), - row("c_timestamp", null, 0.0, 0.0, null, null, null, null), - row("c_date", null, 0.0, 0.0, null, null, null, null), - row("c_string", 0.0, 0.0, 0.0, null, null, null, null), - row("c_varchar", 0.0, 0.0, 0.0, null, null, null, null), - row("c_char", 0.0, 0.0, 0.0, null, null, null, null), - row("c_boolean", null, 0.0, 0.0, null, null, null, null), - row("c_binary", 0.0, null, 0.0, null, null, null, null), - row(null, null, null, null, 0.0, null, null, null)); + row("c_tinyint", null, 0.0, 0.0, null, null, null), + row("c_smallint", null, 0.0, 0.0, null, null, null), + row("c_int", null, 0.0, 0.0, null, null, null), + row("c_bigint", null, 0.0, 0.0, null, null, null), + row("c_float", null, 0.0, 0.0, null, null, null), + row("c_double", null, 0.0, 0.0, null, null, null), + row("c_decimal", null, 0.0, 0.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 0.0, null, null, null), + row("c_timestamp", null, 0.0, 0.0, null, null, null), + row("c_date", null, 0.0, 0.0, null, null, null), + row("c_string", 0.0, 0.0, 0.0, null, null, null), + row("c_varchar", 0.0, 0.0, 0.0, null, null, null), + row("c_char", 0.0, 0.0, 0.0, null, null, null), + row("c_boolean", null, 0.0, 0.0, null, null, null), + row("c_binary", 0.0, null, 0.0, null, 
null, null), + row(null, null, null, null, 0.0, null, null)); private static final class AllTypesTable implements RequirementsProvider @@ -179,33 +179,33 @@ public void testStatisticsForUnpartitionedTable() // table not analyzed assertThat(query(showStatsWholeTable)).containsOnly( - row("n_nationkey", null, null, anyOf(null, 0.0), null, null, null, null), - row("n_name", null, null, anyOf(null, 0.0), null, null, null, null), - row("n_regionkey", null, null, anyOf(null, 0.0), null, null, null, null), - row("n_comment", null, null, anyOf(null, 0.0), null, null, null, null), - row(null, null, null, null, anyOf(null, 0.0), null, null, null)); // anyOf because of different behaviour on HDP (hive 1.2) and CDH (hive 1.1) + row("n_nationkey", null, null, anyOf(null, 0.0), null, null, null), + row("n_name", null, null, anyOf(null, 0.0), null, null, null), + row("n_regionkey", null, null, anyOf(null, 0.0), null, null, null), + row("n_comment", null, null, anyOf(null, 0.0), null, null, null), + row(null, null, null, null, anyOf(null, 0.0), null, null)); // anyOf because of different behaviour on HDP (hive 1.2) and CDH (hive 1.1) // basic analysis onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("n_nationkey", null, null, null, null, null, null, null), - row("n_name", null, null, null, null, null, null, null), - row("n_regionkey", null, null, null, null, null, null, null), - row("n_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 25.0, null, null, null)); + row("n_nationkey", null, null, null, null, null, null), + row("n_name", null, null, null, null, null, null), + row("n_regionkey", null, null, null, null, null, null), + row("n_comment", null, null, null, null, null, null), + row(null, null, null, null, 25.0, null, null)); // column analysis onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS FOR COLUMNS"); 
assertThat(query(showStatsWholeTable)).containsOnly( - row("n_nationkey", null, 19.0, 0.0, null, "0", "24", null), - row("n_name", 177.0, 24.0, 0.0, null, null, null, null), - row("n_regionkey", null, 5.0, 0.0, null, "0", "4", null), - row("n_comment", 1857.0, 25.0, 0.0, null, null, null, null), - row(null, null, null, null, 25.0, null, null, null)); + row("n_nationkey", null, 19.0, 0.0, null, "0", "24"), + row("n_name", 177.0, 24.0, 0.0, null, null, null), + row("n_regionkey", null, 5.0, 0.0, null, "0", "4"), + row("n_comment", 1857.0, 25.0, 0.0, null, null, null), + row(null, null, null, null, 25.0, null, null)); } @Test @@ -221,118 +221,118 @@ public void testStatisticsForTablePartitionedByBigint() // table not analyzed assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, 
null)); // basic analysis for single partition onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey = \"1\") COMPUTE STATISTICS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, 3.0, 0.0, null, "1", "3", null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, 
null), + row(null, null, null, null, null, null, null)); // basic analysis for all partitions onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey) COMPUTE STATISTICS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, 3.0, 0.0, null, "1", "3", null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "2", "2", null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "2", "2"), + row("p_comment", null, 
null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); // column analysis for single partition onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey = \"1\") COMPUTE STATISTICS FOR COLUMNS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 114.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 3.0, 0.0, null, "1", "3", null), - row("p_comment", 1497.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 114.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "2", "2", null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "2", 
"2"), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); // column analysis for all partitions onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey) COMPUTE STATISTICS FOR COLUMNS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 109.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 3.0, 0.0, null, "1", "3", null), - row("p_comment", 1197.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 109.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, 4.0, 0.0, null, "8", "21", null), - row("p_name", 31.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "2", "2", null), - row("p_comment", 351.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 4.0, 0.0, null, "8", "21"), + row("p_name", 31.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, 
"2", "2"), + row("p_comment", 351.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); } @Test @@ -348,118 +348,118 @@ public void testStatisticsForTablePartitionedByVarchar() // table not analyzed assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); // basic analysis for single partition onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey = \"AMERICA\") COMPUTE STATISTICS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", 85.0, 3.0, 0.0, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 15.0, null, 
null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", 35.0, 1.0, 0.0, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); // basic analysis for all partitions onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey) COMPUTE STATISTICS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", 85.0, 3.0, 0.0, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), 
- row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", 35.0, 1.0, 0.0, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", 20.0, 1.0, 0.0, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", 20.0, 1.0, 0.0, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); // column analysis for single partition onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey = \"AMERICA\") COMPUTE STATISTICS FOR COLUMNS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 114.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 85.0, 3.0, 0.0, null, null, null, null), - 
row("p_comment", 1497.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 114.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 35.0, 1.0, 0.0, null, null, null, null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", 20.0, 1.0, 0.0, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", 20.0, 1.0, 0.0, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, 5.0, null, null)); // column analysis for all partitions onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " PARTITION (p_regionkey) COMPUTE STATISTICS FOR COLUMNS"); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 109.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 85.0, 3.0, 
0.0, null, null, null, null), - row("p_comment", 1197.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 109.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 35.0, 1.0, 0.0, null, null, null, null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, 4.0, 0.0, null, "8", "21", null), - row("p_name", 31.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 20.0, 1.0, 0.0, null, null, null, null), - row("p_comment", 351.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 4.0, 0.0, null, "8", "21"), + row("p_name", 31.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 20.0, 1.0, 0.0, null, null, null), + row("p_comment", 351.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); } // This covers also stats calculation for unpartitioned table @@ -472,43 +472,43 @@ public void testStatisticsForAllDataTypes() onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS"); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, null, null, null, null, null, 
null), - row("c_smallint", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row("c_bigint", null, null, null, null, null, null, null), - row("c_float", null, null, null, null, null, null, null), - row("c_double", null, null, null, null, null, null, null), - row("c_decimal", null, null, null, null, null, null, null), - row("c_decimal_w_params", null, null, null, null, null, null, null), - row("c_timestamp", null, null, null, null, null, null, null), - row("c_date", null, null, null, null, null, null, null), - row("c_string", null, null, null, null, null, null, null), - row("c_varchar", null, null, null, null, null, null, null), - row("c_char", null, null, null, null, null, null, null), - row("c_boolean", null, null, null, null, null, null, null), - row("c_binary", null, null, null, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 2.0, null, null)); onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS FOR COLUMNS"); // SHOW STATS FORMAT: column_name, data_size, distinct_values_count, nulls_fraction, 
row_count assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, 2.0, 0.0, null, "121", "127", null), - row("c_smallint", null, 2.0, 0.0, null, "32761", "32767", null), - row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647", null), - row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807", null), - row("c_float", null, 2.0, 0.0, null, "123.341", "123.345", null), - row("c_double", null, 2.0, 0.0, null, "234.561", "235.567", null), - row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0", null), - row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678", null), - row("c_timestamp", null, 2.0, 0.0, null, null, null, null), // timestamp is shifted by hive.time-zone on read - row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10", null), - row("c_string", 22.0, 2.0, 0.0, null, null, null, null), - row("c_varchar", 20.0, 2.0, 0.0, null, null, null, null), - row("c_char", 12.0, 2.0, 0.0, null, null, null, null), - row("c_boolean", null, 2.0, 0.0, null, null, null, null), - row("c_binary", 23.0, null, 0.0, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_tinyint", null, 2.0, 0.0, null, "121", "127"), + row("c_smallint", null, 2.0, 0.0, null, "32761", "32767"), + row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647"), + row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 2.0, 0.0, null, "123.341", "123.345"), + row("c_double", null, 2.0, 0.0, null, "234.561", "235.567"), + row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678"), + row("c_timestamp", null, 2.0, 0.0, null, null, null), // timestamp is shifted by hive.time-zone on read + row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), + row("c_string", 22.0, 2.0, 0.0, null, null, null), + row("c_varchar", 20.0, 2.0, 0.0, null, null, 
null), + row("c_char", 12.0, 2.0, 0.0, null, null, null), + row("c_boolean", null, 2.0, 0.0, null, null, null), + row("c_binary", 23.0, null, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null)); } @Test(groups = {SKIP_ON_CDH}) // skip on cdh due to no support for date column and stats @@ -520,42 +520,42 @@ public void testStatisticsForAllDataTypesNoData() onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS"); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, null, null, null, null, null, null), - row("c_smallint", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row("c_bigint", null, null, null, null, null, null, null), - row("c_float", null, null, null, null, null, null, null), - row("c_double", null, null, null, null, null, null, null), - row("c_decimal", null, null, null, null, null, null, null), - row("c_decimal_w_params", null, null, null, null, null, null, null), - row("c_timestamp", null, null, null, null, null, null, null), - row("c_date", null, null, null, null, null, null, null), - row("c_string", null, null, null, null, null, null, null), - row("c_varchar", null, null, null, null, null, null, null), - row("c_char", null, null, null, null, null, null, null), - row("c_boolean", null, null, null, null, null, null, null), - row("c_binary", null, null, null, null, null, null, null), - row(null, null, null, null, 0.0, null, null, null)); + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, 
null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 0.0, null, null)); onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS FOR COLUMNS"); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, 0.0, 0.0, null, null, null, null), - row("c_smallint", null, 0.0, 0.0, null, null, null, null), - row("c_int", null, 0.0, 0.0, null, null, null, null), - row("c_bigint", null, 0.0, 0.0, null, null, null, null), - row("c_float", null, 0.0, 0.0, null, null, null, null), - row("c_double", null, 0.0, 0.0, null, null, null, null), - row("c_decimal", null, 0.0, 0.0, null, null, null, null), - row("c_decimal_w_params", null, 0.0, 0.0, null, null, null, null), - row("c_timestamp", null, 0.0, 0.0, null, null, null, null), - row("c_date", null, 0.0, 0.0, null, null, null, null), - row("c_string", 0.0, 0.0, 0.0, null, null, null, null), - row("c_varchar", 0.0, 0.0, 0.0, null, null, null, null), - row("c_char", 0.0, 0.0, 0.0, null, null, null, null), - row("c_boolean", null, 0.0, 0.0, null, null, null, null), - row("c_binary", 0.0, null, 0.0, null, null, null, null), - row(null, null, null, null, 0.0, null, null, null)); + row("c_tinyint", null, 0.0, 0.0, null, null, null), + row("c_smallint", null, 0.0, 0.0, null, null, null), + row("c_int", null, 0.0, 0.0, null, null, null), + row("c_bigint", null, 0.0, 0.0, null, null, null), + row("c_float", null, 0.0, 0.0, null, null, null), + row("c_double", null, 0.0, 0.0, null, null, null), + row("c_decimal", null, 0.0, 0.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 0.0, null, null, null), + row("c_timestamp", null, 0.0, 0.0, 
null, null, null), + row("c_date", null, 0.0, 0.0, null, null, null), + row("c_string", 0.0, 0.0, 0.0, null, null, null), + row("c_varchar", 0.0, 0.0, 0.0, null, null, null), + row("c_char", 0.0, 0.0, 0.0, null, null, null), + row("c_boolean", null, 0.0, 0.0, null, null, null), + row("c_binary", 0.0, null, 0.0, null, null, null), + row(null, null, null, null, 0.0, null, null)); } @Test(groups = {SKIP_ON_CDH}) // skip on cdh due to no support for date column and stats @@ -568,42 +568,42 @@ public void testStatisticsForAllDataTypesOnlyNulls() onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS"); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, null, null, null, null, null, null), - row("c_smallint", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row("c_bigint", null, null, null, null, null, null, null), - row("c_float", null, null, null, null, null, null, null), - row("c_double", null, null, null, null, null, null, null), - row("c_decimal", null, null, null, null, null, null, null), - row("c_decimal_w_params", null, null, null, null, null, null, null), - row("c_timestamp", null, null, null, null, null, null, null), - row("c_date", null, null, null, null, null, null, null), - row("c_string", null, null, null, null, null, null, null), - row("c_varchar", null, null, null, null, null, null, null), - row("c_char", null, null, null, null, null, null, null), - row("c_boolean", null, null, null, null, null, null, null), - row("c_binary", null, null, null, null, null, null, null), - row(null, null, null, null, 1.0, null, null, null)); + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, 
null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 1.0, null, null)); onHive().executeQuery("ANALYZE TABLE " + tableNameInDatabase + " COMPUTE STATISTICS FOR COLUMNS"); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, 0.0, 1.0, null, null, null, null), - row("c_smallint", null, 0.0, 1.0, null, null, null, null), - row("c_int", null, 0.0, 1.0, null, null, null, null), - row("c_bigint", null, 0.0, 1.0, null, null, null, null), - row("c_float", null, 0.0, 1.0, null, null, null, null), - row("c_double", null, 0.0, 1.0, null, null, null, null), - row("c_decimal", null, 0.0, 1.0, null, null, null, null), - row("c_decimal_w_params", null, 0.0, 1.0, null, null, null, null), - row("c_timestamp", null, 0.0, 1.0, null, null, null, null), - row("c_date", null, 0.0, 1.0, null, null, null, null), - row("c_string", 0.0, 0.0, 1.0, null, null, null, null), - row("c_varchar", 0.0, 0.0, 1.0, null, null, null, null), - row("c_char", 0.0, 0.0, 1.0, null, null, null, null), - row("c_boolean", null, 0.0, 1.0, null, null, null, null), - row("c_binary", 0.0, null, 1.0, null, null, null, null), - row(null, null, null, null, 1.0, null, null, null)); + row("c_tinyint", null, 0.0, 1.0, null, null, null), + row("c_smallint", null, 0.0, 1.0, null, null, null), + row("c_int", null, 0.0, 1.0, null, null, null), + row("c_bigint", null, 0.0, 1.0, null, null, null), + row("c_float", null, 0.0, 1.0, null, null, null), + row("c_double", null, 0.0, 1.0, null, 
null, null), + row("c_decimal", null, 0.0, 1.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 1.0, null, null, null), + row("c_timestamp", null, 0.0, 1.0, null, null, null), + row("c_date", null, 0.0, 1.0, null, null, null), + row("c_string", 0.0, 0.0, 1.0, null, null, null), + row("c_varchar", 0.0, 0.0, 1.0, null, null, null), + row("c_char", 0.0, 0.0, 1.0, null, null, null), + row("c_boolean", null, 0.0, 1.0, null, null, null), + row("c_binary", 0.0, null, 1.0, null, null, null), + row(null, null, null, null, 1.0, null, null)); } @Test @@ -616,22 +616,22 @@ public void testStatisticsForSkewedTable() onHive().executeQuery("INSERT INTO TABLE " + tableName + " VALUES ('c1', 1), ('c1', 2)"); assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( - row("c_string", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_string", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row(null, null, null, null, 2.0, null, null)); onHive().executeQuery("ANALYZE TABLE " + tableName + " COMPUTE STATISTICS"); assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( - row("c_string", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_string", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row(null, null, null, null, 2.0, null, null)); onHive().executeQuery("ANALYZE TABLE " + tableName + " COMPUTE STATISTICS FOR COLUMNS"); assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( - row("c_string", 4.0, 1.0, 0.0, null, null, null, null), - row("c_int", null, 2.0, 0.0, null, "1", "2", null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_string", 4.0, 1.0, 0.0, null, null, null), + row("c_int", null, 2.0, 0.0, null, "1", "2"), + 
row(null, null, null, null, 2.0, null, null)); } @Test @@ -644,15 +644,15 @@ public void testAnalyzesForSkewedTable() onHive().executeQuery("INSERT INTO TABLE " + tableName + " VALUES ('c1', 1), ('c1', 2)"); assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( - row("c_string", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_string", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row(null, null, null, null, 2.0, null, null)); assertThat(query("ANALYZE " + tableName)).containsExactly(row(2)); assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( - row("c_string", 4.0, 1.0, 0.0, null, null, null, null), - row("c_int", null, 2.0, 0.0, null, "1", "2", null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_string", 4.0, 1.0, 0.0, null, null, null), + row("c_int", null, 2.0, 0.0, null, "1", "2"), + row(null, null, null, null, 2.0, null, null)); } @Test @@ -665,20 +665,20 @@ public void testAnalyzeForUnpartitionedTable() // table not analyzed assertThat(query(showStatsWholeTable)).containsOnly( - row("n_nationkey", null, null, anyOf(null, 0.0), null, null, null, null), - row("n_name", null, null, anyOf(null, 0.0), null, null, null, null), - row("n_regionkey", null, null, anyOf(null, 0.0), null, null, null, null), - row("n_comment", null, null, anyOf(null, 0.0), null, null, null, null), - row(null, null, null, null, anyOf(null, 0.0), null, null, null)); // anyOf because of different behaviour on HDP (hive 1.2) and CDH (hive 1.1) + row("n_nationkey", null, null, anyOf(null, 0.0), null, null, null), + row("n_name", null, null, anyOf(null, 0.0), null, null, null), + row("n_regionkey", null, null, anyOf(null, 0.0), null, null, null), + row("n_comment", null, null, anyOf(null, 0.0), null, null, null), + row(null, null, null, null, anyOf(null, 0.0), null, null)); // anyOf because of 
different behaviour on HDP (hive 1.2) and CDH (hive 1.1) assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(25)); assertThat(query(showStatsWholeTable)).containsOnly( - row("n_nationkey", null, 25.0, 0.0, null, "0", "24", null), - row("n_name", 177.0, 25.0, 0.0, null, null, null, null), - row("n_regionkey", null, 5.0, 0.0, null, "0", "4", null), - row("n_comment", 1857.0, 25.0, 0.0, null, null, null, null), - row(null, null, null, null, 25.0, null, null, null)); + row("n_nationkey", null, 25.0, 0.0, null, "0", "24"), + row("n_name", 177.0, 25.0, 0.0, null, null, null), + row("n_regionkey", null, 5.0, 0.0, null, "0", "4"), + row("n_comment", 1857.0, 25.0, 0.0, null, null, null), + row(null, null, null, null, 25.0, null, null)); } @Test @@ -693,10 +693,10 @@ public void testAnalyzeForTableWithNonPrimitiveTypes() assertThat(query("ANALYZE " + tableName)).containsExactly(row(1)); assertThat(query(showStatsTable)).containsOnly( - row("c_row", null, null, null, null, null, null, null), - row("c_char", 1.0, 1.0, 0.0, null, null, null, null), - row("c_int", null, 1.0, 0.0, null, "3", "3", null), - row(null, null, null, null, 1.0, null, null, null)); + row("c_row", null, null, null, null, null, null), + row("c_char", 1.0, 1.0, 0.0, null, null, null), + row("c_int", null, 1.0, 0.0, null, "3", "3"), + row(null, null, null, null, 1.0, null, null)); } @Test @@ -715,11 +715,11 @@ public void testAnalyzeForPartitionedTableWithNonPrimitiveTypes() assertThat(query("ANALYZE " + tableName)).containsExactly(row(3)); assertThat(query(showStatsTable)).containsOnly( - row("c_row", null, null, null, null, null, null, null), - row("c_char", 3.0, 2.0, 0.0, null, null, null, null), - row("c_int", null, 1.0, 0.0, null, "3", "5", null), - row("c_part", 3.0, 2.0, 0.0, null, null, null, null), - row(null, null, null, null, 3.0, null, null, null)); + row("c_row", null, null, null, null, null, null), + row("c_char", 3.0, 2.0, 0.0, null, null, null), + row("c_int", null, 1.0, 
0.0, null, "3", "5"), + row("c_part", 3.0, 2.0, 0.0, null, null, null), + row(null, null, null, null, 3.0, null, null)); } @Test @@ -735,68 +735,68 @@ public void testAnalyzeForTablePartitionedByBigint() // table not analyzed assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); // analyze for single partition assertThat(query("ANALYZE " + tableNameInDatabase + " WITH (partitions = ARRAY[ARRAY['1']])")).containsExactly(row(5)); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 114.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 3.0, 0.0, null, "1", "3", null), - row("p_comment", 1497.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + 
row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 114.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); // analyze for all partitions assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(15)); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 109.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 3.0, 0.0, null, "1", "3", null), - row("p_comment", 1197.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, 
5.0, 0.0, null, "1", "24"), + row("p_name", 109.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "1", "1", null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "8", "21", null), - row("p_name", 31.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", null, 1.0, 0.0, null, "2", "2", null), - row("p_comment", 351.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "8", "21"), + row("p_name", 31.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "2", "2"), + row("p_comment", 351.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); } @Test @@ -812,68 +812,68 @@ public void testAnalyzeForTablePartitionedByVarchar() // table not analyzed assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + 
row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); // analyze for single partition assertThat(query("ANALYZE " + tableNameInDatabase + " WITH (partitions = ARRAY[ARRAY['AMERICA']])")).containsExactly(row(5)); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 114.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 85.0, 3.0, 0.0, null, null, null, null), - row("p_comment", 1497.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 114.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 35.0, 1.0, 0.0, null, null, null, null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + 
row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, null, null, null, null, null, null), - row("p_name", null, null, null, null, null, null, null), - row("p_regionkey", null, null, null, null, null, null, null), - row("p_comment", null, null, null, null, null, null, null), - row(null, null, null, null, null, null, null, null)); + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); // column analysis for all partitions assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(15)); assertThat(query(showStatsWholeTable)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 109.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 85.0, 3.0, 0.0, null, null, null, null), - row("p_comment", 1197.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 15.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 109.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); assertThat(query(showStatsPartitionOne)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "1", "24", null), - row("p_name", 38.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 35.0, 1.0, 0.0, null, null, null, null), - row("p_comment", 499.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + 
row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); assertThat(query(showStatsPartitionTwo)).containsOnly( - row("p_nationkey", null, 5.0, 0.0, null, "8", "21", null), - row("p_name", 31.0, 5.0, 0.0, null, null, null, null), - row("p_regionkey", 20.0, 1.0, 0.0, null, null, null, null), - row("p_comment", 351.0, 5.0, 0.0, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null)); + row("p_nationkey", null, 5.0, 0.0, null, "8", "21"), + row("p_name", 31.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 20.0, 1.0, 0.0, null, null, null), + row("p_comment", 351.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); } // This covers also stats calculation for unpartitioned table @@ -884,43 +884,43 @@ public void testAnalyzeForAllDataTypes() String tableNameInDatabase = mutableTablesState().get(ALL_TYPES_TABLE_NAME).getNameInDatabase(); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, null, null, null, null, null, null), - row("c_smallint", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row("c_bigint", null, null, null, null, null, null, null), - row("c_float", null, null, null, null, null, null, null), - row("c_double", null, null, null, null, null, null, null), - row("c_decimal", null, null, null, null, null, null, null), - row("c_decimal_w_params", null, null, null, null, null, null, null), - row("c_timestamp", null, null, null, null, null, null, null), - row("c_date", null, null, null, null, null, null, null), - row("c_string", null, null, null, null, null, null, null), - row("c_varchar", null, null, null, null, null, null, null), - row("c_char", null, null, null, null, null, null, null), - row("c_boolean", 
null, null, null, null, null, null, null), - row("c_binary", null, null, null, null, null, null, null), - row(null, null, null, null, 0.0, null, null, null)); + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 0.0, null, null)); assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(2)); // SHOW STATS FORMAT: column_name, data_size, distinct_values_count, nulls_fraction, row_count assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, 2.0, 0.0, null, "121", "127", null), - row("c_smallint", null, 2.0, 0.0, null, "32761", "32767", null), - row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647", null), - row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807", null), - row("c_float", null, 2.0, 0.0, null, "123.341", "123.345", null), - row("c_double", null, 2.0, 0.0, null, "234.561", "235.567", null), - row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0", null), - row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678", null), - row("c_timestamp", null, 2.0, 0.0, null, null, null, null), - row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10", null), 
- row("c_string", 22.0, 2.0, 0.0, null, null, null, null), - row("c_varchar", 20.0, 2.0, 0.0, null, null, null, null), - row("c_char", 12.0, 2.0, 0.0, null, null, null, null), - row("c_boolean", null, 2.0, 0.0, null, null, null, null), - row("c_binary", 23.0, null, 0.0, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null)); + row("c_tinyint", null, 2.0, 0.0, null, "121", "127"), + row("c_smallint", null, 2.0, 0.0, null, "32761", "32767"), + row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647"), + row("c_bigint", null, 2.0, 0.0, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 2.0, 0.0, null, "123.341", "123.345"), + row("c_double", null, 2.0, 0.0, null, "234.561", "235.567"), + row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678"), + row("c_timestamp", null, 2.0, 0.0, null, null, null), + row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), + row("c_string", 22.0, 2.0, 0.0, null, null, null), + row("c_varchar", 20.0, 2.0, 0.0, null, null, null), + row("c_char", 12.0, 2.0, 0.0, null, null, null), + row("c_boolean", null, 2.0, 0.0, null, null, null), + row("c_binary", 23.0, null, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null)); } @Test(groups = {SKIP_ON_CDH}) // skip on cdh due to no support for date column and stats @@ -930,42 +930,42 @@ public void testAnalyzeForAllDataTypesNoData() String tableNameInDatabase = mutableTablesState().get(EMPTY_ALL_TYPES_TABLE_NAME).getNameInDatabase(); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, null, null, null, null, null, null), - row("c_smallint", null, null, null, null, null, null, null), - row("c_int", null, null, null, null, null, null, null), - row("c_bigint", null, null, null, null, null, null, null), - row("c_float", null, null, null, null, null, null, null), - row("c_double", null, null, null, null, 
null, null, null), - row("c_decimal", null, null, null, null, null, null, null), - row("c_decimal_w_params", null, null, null, null, null, null, null), - row("c_timestamp", null, null, null, null, null, null, null), - row("c_date", null, null, null, null, null, null, null), - row("c_string", null, null, null, null, null, null, null), - row("c_varchar", null, null, null, null, null, null, null), - row("c_char", null, null, null, null, null, null, null), - row("c_boolean", null, null, null, null, null, null, null), - row("c_binary", null, null, null, null, null, null, null), - row(null, null, null, null, 0.0, null, null, null)); + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 0.0, null, null)); assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(0)); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, 0.0, 0.0, null, null, null, null), - row("c_smallint", null, 0.0, 0.0, null, null, null, null), - row("c_int", null, 0.0, 0.0, null, null, null, null), - row("c_bigint", null, 0.0, 0.0, null, null, null, null), - row("c_float", null, 0.0, 0.0, null, null, null, null), - row("c_double", null, 0.0, 0.0, 
null, null, null, null), - row("c_decimal", null, 0.0, 0.0, null, null, null, null), - row("c_decimal_w_params", null, 0.0, 0.0, null, null, null, null), - row("c_timestamp", null, 0.0, 0.0, null, null, null, null), - row("c_date", null, 0.0, 0.0, null, null, null, null), - row("c_string", 0.0, 0.0, 0.0, null, null, null, null), - row("c_varchar", 0.0, 0.0, 0.0, null, null, null, null), - row("c_char", 0.0, 0.0, 0.0, null, null, null, null), - row("c_boolean", null, 0.0, 0.0, null, null, null, null), - row("c_binary", 0.0, null, 0.0, null, null, null, null), - row(null, null, null, null, 0.0, null, null, null)); + row("c_tinyint", null, 0.0, 0.0, null, null, null), + row("c_smallint", null, 0.0, 0.0, null, null, null), + row("c_int", null, 0.0, 0.0, null, null, null), + row("c_bigint", null, 0.0, 0.0, null, null, null), + row("c_float", null, 0.0, 0.0, null, null, null), + row("c_double", null, 0.0, 0.0, null, null, null), + row("c_decimal", null, 0.0, 0.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 0.0, null, null, null), + row("c_timestamp", null, 0.0, 0.0, null, null, null), + row("c_date", null, 0.0, 0.0, null, null, null), + row("c_string", 0.0, 0.0, 0.0, null, null, null), + row("c_varchar", 0.0, 0.0, 0.0, null, null, null), + row("c_char", 0.0, 0.0, 0.0, null, null, null), + row("c_boolean", null, 0.0, 0.0, null, null, null), + row("c_binary", 0.0, null, 0.0, null, null, null), + row(null, null, null, null, 0.0, null, null)); } @Test(groups = {SKIP_ON_CDH}) // skip on cdh due to no support for date column and stats @@ -978,42 +978,42 @@ public void testAnalyzeForAllDataTypesOnlyNulls() onHive().executeQuery("INSERT INTO TABLE " + tableNameInDatabase + " VALUES(null, null, null, null, null, null, null, null, null, null, null, null, null, null, null)"); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, null, null, null, null, null, null), - row("c_smallint", null, null, null, null, null, null, 
null), - row("c_int", null, null, null, null, null, null, null), - row("c_bigint", null, null, null, null, null, null, null), - row("c_float", null, null, null, null, null, null, null), - row("c_double", null, null, null, null, null, null, null), - row("c_decimal", null, null, null, null, null, null, null), - row("c_decimal_w_params", null, null, null, null, null, null, null), - row("c_timestamp", null, null, null, null, null, null, null), - row("c_date", null, null, null, null, null, null, null), - row("c_string", null, null, null, null, null, null, null), - row("c_varchar", null, null, null, null, null, null, null), - row("c_char", null, null, null, null, null, null, null), - row("c_boolean", null, null, null, null, null, null, null), - row("c_binary", null, null, null, null, null, null, null), - row(null, null, null, null, 1.0, null, null, null)); + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 1.0, null, null)); assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(1)); assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( - row("c_tinyint", null, 0.0, 1.0, null, null, null, null), - row("c_smallint", null, 0.0, 1.0, null, 
null, null, null), - row("c_int", null, 0.0, 1.0, null, null, null, null), - row("c_bigint", null, 0.0, 1.0, null, null, null, null), - row("c_float", null, 0.0, 1.0, null, null, null, null), - row("c_double", null, 0.0, 1.0, null, null, null, null), - row("c_decimal", null, 0.0, 1.0, null, null, null, null), - row("c_decimal_w_params", null, 0.0, 1.0, null, null, null, null), - row("c_timestamp", null, 0.0, 1.0, null, null, null, null), - row("c_date", null, 0.0, 1.0, null, null, null, null), - row("c_string", 0.0, 0.0, 1.0, null, null, null, null), - row("c_varchar", 0.0, 0.0, 1.0, null, null, null, null), - row("c_char", 0.0, 0.0, 1.0, null, null, null, null), - row("c_boolean", null, 0.0, 1.0, null, null, null, null), - row("c_binary", 0.0, null, 1.0, null, null, null, null), - row(null, null, null, null, 1.0, null, null, null)); + row("c_tinyint", null, 0.0, 1.0, null, null, null), + row("c_smallint", null, 0.0, 1.0, null, null, null), + row("c_int", null, 0.0, 1.0, null, null, null), + row("c_bigint", null, 0.0, 1.0, null, null, null), + row("c_float", null, 0.0, 1.0, null, null, null), + row("c_double", null, 0.0, 1.0, null, null, null), + row("c_decimal", null, 0.0, 1.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 1.0, null, null, null), + row("c_timestamp", null, 0.0, 1.0, null, null, null), + row("c_date", null, 0.0, 1.0, null, null, null), + row("c_string", 0.0, 0.0, 1.0, null, null, null), + row("c_varchar", 0.0, 0.0, 1.0, null, null, null), + row("c_char", 0.0, 0.0, 1.0, null, null, null), + row("c_boolean", null, 0.0, 1.0, null, null, null), + row("c_binary", 0.0, null, 1.0, null, null, null), + row(null, null, null, null, 1.0, null, null)); } @Test @@ -1049,22 +1049,22 @@ public void testComputeTableStatisticsOnInsert() query(format("INSERT INTO %s SELECT * FROM %s", tableName, allTypesAllNullTable)); query(format("INSERT INTO %s SELECT * FROM %s", tableName, allTypesAllNullTable)); assertThat(query("SHOW STATS FOR " + 
tableName)).containsOnly( - row("c_tinyint", null, 2.0, 0.5, null, "121", "127", null), - row("c_smallint", null, 2.0, 0.5, null, "32761", "32767", null), - row("c_int", null, 2.0, 0.5, null, "2147483641", "2147483647", null), - row("c_bigint", null, 2.0, 0.5, null, "9223372036854775807", "9223372036854775807", null), - row("c_float", null, 2.0, 0.5, null, "123.341", "123.345", null), - row("c_double", null, 2.0, 0.5, null, "234.561", "235.567", null), - row("c_decimal", null, 2.0, 0.5, null, "345.0", "346.0", null), - row("c_decimal_w_params", null, 2.0, 0.5, null, "345.671", "345.678", null), - row("c_timestamp", null, 2.0, 0.5, null, null, null, null), - row("c_date", null, 2.0, 0.5, null, "2015-05-09", "2015-06-10", null), - row("c_string", 22.0, 2.0, 0.5, null, null, null, null), - row("c_varchar", 20.0, 2.0, 0.5, null, null, null, null), - row("c_char", 12.0, 2.0, 0.5, null, null, null, null), - row("c_boolean", null, 2.0, 0.5, null, null, null, null), - row("c_binary", 23.0, null, 0.5, null, null, null, null), - row(null, null, null, null, 4.0, null, null, null)); + row("c_tinyint", null, 2.0, 0.5, null, "121", "127"), + row("c_smallint", null, 2.0, 0.5, null, "32761", "32767"), + row("c_int", null, 2.0, 0.5, null, "2147483641", "2147483647"), + row("c_bigint", null, 2.0, 0.5, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 2.0, 0.5, null, "123.341", "123.345"), + row("c_double", null, 2.0, 0.5, null, "234.561", "235.567"), + row("c_decimal", null, 2.0, 0.5, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.5, null, "345.671", "345.678"), + row("c_timestamp", null, 2.0, 0.5, null, null, null), + row("c_date", null, 2.0, 0.5, null, "2015-05-09", "2015-06-10"), + row("c_string", 22.0, 2.0, 0.5, null, null, null), + row("c_varchar", 20.0, 2.0, 0.5, null, null, null), + row("c_char", 12.0, 2.0, 0.5, null, null, null), + row("c_boolean", null, 2.0, 0.5, null, null, null), + row("c_binary", 23.0, null, 0.5, null, null, 
null), + row(null, null, null, null, 4.0, null, null)); query(format("INSERT INTO %s VALUES( " + "TINYINT '120', " + @@ -1084,22 +1084,22 @@ public void testComputeTableStatisticsOnInsert() "CAST('cGllcyBiaW5hcm54' as VARBINARY))", tableName)); assertThat(query("SHOW STATS FOR " + tableName)).containsOnly(ImmutableList.of( - row("c_tinyint", null, 2.0, 0.4, null, "120", "127", null), - row("c_smallint", null, 2.0, 0.4, null, "32760", "32767", null), - row("c_int", null, 2.0, 0.4, null, "2147483640", "2147483647", null), - row("c_bigint", null, 2.0, 0.4, null, "9223372036854775807", "9223372036854775807", null), - row("c_float", null, 2.0, 0.4, null, "123.34", "123.345", null), - row("c_double", null, 2.0, 0.4, null, "234.56", "235.567", null), - row("c_decimal", null, 2.0, 0.4, null, "343.0", "346.0", null), - row("c_decimal_w_params", null, 2.0, 0.4, null, "345.67", "345.678", null), - row("c_timestamp", null, 2.0, 0.4, null, null, null, null), - row("c_date", null, 2.0, 0.4, null, "2015-05-08", "2015-06-10", null), - row("c_string", 32.0, 2.0, 0.4, null, null, null, null), - row("c_varchar", 29.0, 2.0, 0.4, null, null, null, null), - row("c_char", 17.0, 2.0, 0.4, null, null, null, null), - row("c_boolean", null, 2.0, 0.4, null, null, null, null), - row("c_binary", 39.0, null, 0.4, null, null, null, null), - row(null, null, null, null, 5.0, null, null, null))); + row("c_tinyint", null, 2.0, 0.4, null, "120", "127"), + row("c_smallint", null, 2.0, 0.4, null, "32760", "32767"), + row("c_int", null, 2.0, 0.4, null, "2147483640", "2147483647"), + row("c_bigint", null, 2.0, 0.4, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 2.0, 0.4, null, "123.34", "123.345"), + row("c_double", null, 2.0, 0.4, null, "234.56", "235.567"), + row("c_decimal", null, 2.0, 0.4, null, "343.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.4, null, "345.67", "345.678"), + row("c_timestamp", null, 2.0, 0.4, null, null, null), + row("c_date", null, 2.0, 0.4, 
null, "2015-05-08", "2015-06-10"), + row("c_string", 32.0, 2.0, 0.4, null, null, null), + row("c_varchar", 29.0, 2.0, 0.4, null, null, null), + row("c_char", 17.0, 2.0, 0.4, null, null, null), + row("c_boolean", null, 2.0, 0.4, null, null, null), + row("c_binary", 39.0, null, 0.4, null, null, null), + row(null, null, null, null, 5.0, null, null))); } finally { query(format("DROP TABLE IF EXISTS %s", tableName)); @@ -1160,44 +1160,44 @@ public void testComputePartitionStatisticsOnCreateTable() ") AS t (c_tinyint, c_smallint, c_int, c_bigint, c_float, c_double, c_decimal, c_decimal_w_params, c_timestamp, c_date, c_string, c_varchar, c_char, c_boolean, c_binary, p_bigint, p_varchar)", tableName)); assertThat(query(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_bigint = 1 AND p_varchar = 'partition1')", tableName))).containsOnly(ImmutableList.of( - row("c_tinyint", null, 1.0, 0.5, null, "120", "120", null), - row("c_smallint", null, 1.0, 0.5, null, "32760", "32760", null), - row("c_int", null, 1.0, 0.5, null, "2147483640", "2147483640", null), - row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807", null), - row("c_float", null, 1.0, 0.5, null, "123.34", "123.34", null), - row("c_double", null, 1.0, 0.5, null, "234.56", "234.56", null), - row("c_decimal", null, 1.0, 0.5, null, "343.0", "343.0", null), - row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67", "345.67", null), - row("c_timestamp", null, 1.0, 0.5, null, null, null, null), - row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08", null), - row("c_string", 10.0, 1.0, 0.5, null, null, null, null), - row("c_varchar", 10.0, 1.0, 0.5, null, null, null, null), - row("c_char", 9.0, 1.0, 0.5, null, null, null, null), - row("c_boolean", null, 1.0, 0.5, null, null, null, null), - row("c_binary", 9.0, null, 0.5, null, null, null, null), - row("p_bigint", null, 1.0, 0.0, null, "1", "1", null), - row("p_varchar", 20.0, 1.0, 0.0, null, null, null, null), - row(null, null, null, 
null, 2.0, null, null, null))); + row("c_tinyint", null, 1.0, 0.5, null, "120", "120"), + row("c_smallint", null, 1.0, 0.5, null, "32760", "32760"), + row("c_int", null, 1.0, 0.5, null, "2147483640", "2147483640"), + row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 1.0, 0.5, null, "123.34", "123.34"), + row("c_double", null, 1.0, 0.5, null, "234.56", "234.56"), + row("c_decimal", null, 1.0, 0.5, null, "343.0", "343.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67", "345.67"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), + row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08"), + row("c_string", 10.0, 1.0, 0.5, null, null, null), + row("c_varchar", 10.0, 1.0, 0.5, null, null, null), + row("c_char", 9.0, 1.0, 0.5, null, null, null), + row("c_boolean", null, 1.0, 0.5, null, null, null), + row("c_binary", 9.0, null, 0.5, null, null, null), + row("p_bigint", null, 1.0, 0.0, null, "1", "1"), + row("p_varchar", 20.0, 1.0, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null))); assertThat(query(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_bigint = 2 AND p_varchar = 'partition2')", tableName))).containsOnly(ImmutableList.of( - row("c_tinyint", null, 1.0, 0.5, null, "99", "99", null), - row("c_smallint", null, 1.0, 0.5, null, "333", "333", null), - row("c_int", null, 1.0, 0.5, null, "444", "444", null), - row("c_bigint", null, 1.0, 0.5, null, "555", "555", null), - row("c_float", null, 1.0, 0.5, null, "666.34", "666.34", null), - row("c_double", null, 1.0, 0.5, null, "777.56", "777.56", null), - row("c_decimal", null, 1.0, 0.5, null, "888.0", "888.0", null), - row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "999.67", null), - row("c_timestamp", null, 1.0, 0.5, null, null, null, null), - row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09", null), - row("c_string", 10.0, 1.0, 0.5, null, null, null, null), - row("c_varchar", 10.0, 1.0, 0.5, 
null, null, null, null), - row("c_char", 9.0, 1.0, 0.5, null, null, null, null), - row("c_boolean", null, 1.0, 0.5, null, null, null, null), - row("c_binary", 9.0, null, 0.5, null, null, null, null), - row("p_bigint", null, 1.0, 0.0, null, "2", "2", null), - row("p_varchar", 20.0, 1.0, 0.0, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null))); + row("c_tinyint", null, 1.0, 0.5, null, "99", "99"), + row("c_smallint", null, 1.0, 0.5, null, "333", "333"), + row("c_int", null, 1.0, 0.5, null, "444", "444"), + row("c_bigint", null, 1.0, 0.5, null, "555", "555"), + row("c_float", null, 1.0, 0.5, null, "666.34", "666.34"), + row("c_double", null, 1.0, 0.5, null, "777.56", "777.56"), + row("c_decimal", null, 1.0, 0.5, null, "888.0", "888.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "999.67"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), + row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09"), + row("c_string", 10.0, 1.0, 0.5, null, null, null), + row("c_varchar", 10.0, 1.0, 0.5, null, null, null), + row("c_char", 9.0, 1.0, 0.5, null, null, null), + row("c_boolean", null, 1.0, 0.5, null, null, null), + row("c_binary", 9.0, null, 0.5, null, null, null), + row("p_bigint", null, 1.0, 0.0, null, "2", "2"), + row("p_varchar", 20.0, 1.0, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null))); } finally { query(format("DROP TABLE IF EXISTS %s", tableName)); @@ -1246,90 +1246,90 @@ public void testComputePartitionStatisticsOnInsert() String showStatsPartitionTwo = format("SHOW STATS FOR (SELECT * FROM %s WHERE p_bigint = 2 AND p_varchar = 'partition2')", tableName); assertThat(query(showStatsPartitionOne)).containsOnly(ImmutableList.of( - row("c_tinyint", null, 1.0, 0.5, null, "120", "120", null), - row("c_smallint", null, 1.0, 0.5, null, "32760", "32760", null), - row("c_int", null, 1.0, 0.5, null, "2147483640", "2147483640", null), - row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", 
"9223372036854775807", null), - row("c_float", null, 1.0, 0.5, null, "123.34", "123.34", null), - row("c_double", null, 1.0, 0.5, null, "234.56", "234.56", null), - row("c_decimal", null, 1.0, 0.5, null, "343.0", "343.0", null), - row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67", "345.67", null), - row("c_timestamp", null, 1.0, 0.5, null, null, null, null), - row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08", null), - row("c_string", 10.0, 1.0, 0.5, null, null, null, null), - row("c_varchar", 10.0, 1.0, 0.5, null, null, null, null), - row("c_char", 9.0, 1.0, 0.5, null, null, null, null), - row("c_boolean", null, 1.0, 0.5, null, null, null, null), - row("c_binary", 9.0, null, 0.5, null, null, null, null), - row("p_bigint", null, 1.0, 0.0, null, "1", "1", null), - row("p_varchar", 20.0, 1.0, 0.0, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null))); + row("c_tinyint", null, 1.0, 0.5, null, "120", "120"), + row("c_smallint", null, 1.0, 0.5, null, "32760", "32760"), + row("c_int", null, 1.0, 0.5, null, "2147483640", "2147483640"), + row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 1.0, 0.5, null, "123.34", "123.34"), + row("c_double", null, 1.0, 0.5, null, "234.56", "234.56"), + row("c_decimal", null, 1.0, 0.5, null, "343.0", "343.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "345.67", "345.67"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), + row("c_date", null, 1.0, 0.5, null, "2015-05-08", "2015-05-08"), + row("c_string", 10.0, 1.0, 0.5, null, null, null), + row("c_varchar", 10.0, 1.0, 0.5, null, null, null), + row("c_char", 9.0, 1.0, 0.5, null, null, null), + row("c_boolean", null, 1.0, 0.5, null, null, null), + row("c_binary", 9.0, null, 0.5, null, null, null), + row("p_bigint", null, 1.0, 0.0, null, "1", "1"), + row("p_varchar", 20.0, 1.0, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null))); 
assertThat(query(showStatsPartitionTwo)).containsOnly(ImmutableList.of( - row("c_tinyint", null, 1.0, 0.5, null, "99", "99", null), - row("c_smallint", null, 1.0, 0.5, null, "333", "333", null), - row("c_int", null, 1.0, 0.5, null, "444", "444", null), - row("c_bigint", null, 1.0, 0.5, null, "555", "555", null), - row("c_float", null, 1.0, 0.5, null, "666.34", "666.34", null), - row("c_double", null, 1.0, 0.5, null, "777.56", "777.56", null), - row("c_decimal", null, 1.0, 0.5, null, "888.0", "888.0", null), - row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "999.67", null), - row("c_timestamp", null, 1.0, 0.5, null, null, null, null), - row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09", null), - row("c_string", 10.0, 1.0, 0.5, null, null, null, null), - row("c_varchar", 10.0, 1.0, 0.5, null, null, null, null), - row("c_char", 9.0, 1.0, 0.5, null, null, null, null), - row("c_boolean", null, 1.0, 0.5, null, null, null, null), - row("c_binary", 9.0, null, 0.5, null, null, null, null), - row("p_bigint", null, 1.0, 0.0, null, "2", "2", null), - row("p_varchar", 20.0, 1.0, 0.0, null, null, null, null), - row(null, null, null, null, 2.0, null, null, null))); + row("c_tinyint", null, 1.0, 0.5, null, "99", "99"), + row("c_smallint", null, 1.0, 0.5, null, "333", "333"), + row("c_int", null, 1.0, 0.5, null, "444", "444"), + row("c_bigint", null, 1.0, 0.5, null, "555", "555"), + row("c_float", null, 1.0, 0.5, null, "666.34", "666.34"), + row("c_double", null, 1.0, 0.5, null, "777.56", "777.56"), + row("c_decimal", null, 1.0, 0.5, null, "888.0", "888.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "999.67"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), + row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-09"), + row("c_string", 10.0, 1.0, 0.5, null, null, null), + row("c_varchar", 10.0, 1.0, 0.5, null, null, null), + row("c_char", 9.0, 1.0, 0.5, null, null, null), + row("c_boolean", null, 1.0, 0.5, null, null, null), + 
row("c_binary", 9.0, null, 0.5, null, null, null), + row("p_bigint", null, 1.0, 0.0, null, "2", "2"), + row("p_varchar", 20.0, 1.0, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null))); query(format("INSERT INTO %s VALUES( TINYINT '119', SMALLINT '32759', INTEGER '2147483639', BIGINT '9223372036854775799', REAL '122.340', DOUBLE '233.560', CAST(342.0 AS DECIMAL(10, 0)), CAST(344.670 AS DECIMAL(10, 5)), TIMESTAMP '2015-05-10 12:15:29', DATE '2015-05-07', 'p1 varchar', CAST('p1 varchar10' AS VARCHAR(10)), CAST('p1 char10' AS CHAR(10)), true, CAST('p1 binary' as VARBINARY), BIGINT '1', 'partition1')", tableName)); query(format("INSERT INTO %s VALUES( null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, BIGINT '1', 'partition1')", tableName)); assertThat(query(showStatsPartitionOne)).containsOnly(ImmutableList.of( - row("c_tinyint", null, 1.0, 0.5, null, "119", "120", null), - row("c_smallint", null, 1.0, 0.5, null, "32759", "32760", null), - row("c_int", null, 1.0, 0.5, null, "2147483639", "2147483640", null), - row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807", null), - row("c_float", null, 1.0, 0.5, null, "122.34", "123.34", null), - row("c_double", null, 1.0, 0.5, null, "233.56", "234.56", null), - row("c_decimal", null, 1.0, 0.5, null, "342.0", "343.0", null), - row("c_decimal_w_params", null, 1.0, 0.5, null, "344.67", "345.67", null), - row("c_timestamp", null, 1.0, 0.5, null, null, null, null), - row("c_date", null, 1.0, 0.5, null, "2015-05-07", "2015-05-08", null), - row("c_string", 20.0, 1.0, 0.5, null, null, null, null), - row("c_varchar", 20.0, 1.0, 0.5, null, null, null, null), - row("c_char", 18.0, 1.0, 0.5, null, null, null, null), - row("c_boolean", null, 2.0, 0.5, null, null, null, null), - row("c_binary", 18.0, null, 0.5, null, null, null, null), - row("p_bigint", null, 1.0, 0.0, null, "1", "1", null), - row("p_varchar", 40.0, 1.0, 0.0, null, null, null, null), - 
row(null, null, null, null, 4.0, null, null, null))); + row("c_tinyint", null, 1.0, 0.5, null, "119", "120"), + row("c_smallint", null, 1.0, 0.5, null, "32759", "32760"), + row("c_int", null, 1.0, 0.5, null, "2147483639", "2147483640"), + row("c_bigint", null, 1.0, 0.5, null, "9223372036854775807", "9223372036854775807"), + row("c_float", null, 1.0, 0.5, null, "122.34", "123.34"), + row("c_double", null, 1.0, 0.5, null, "233.56", "234.56"), + row("c_decimal", null, 1.0, 0.5, null, "342.0", "343.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "344.67", "345.67"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), + row("c_date", null, 1.0, 0.5, null, "2015-05-07", "2015-05-08"), + row("c_string", 20.0, 1.0, 0.5, null, null, null), + row("c_varchar", 20.0, 1.0, 0.5, null, null, null), + row("c_char", 18.0, 1.0, 0.5, null, null, null), + row("c_boolean", null, 2.0, 0.5, null, null, null), + row("c_binary", 18.0, null, 0.5, null, null, null), + row("p_bigint", null, 1.0, 0.0, null, "1", "1"), + row("p_varchar", 40.0, 1.0, 0.0, null, null, null), + row(null, null, null, null, 4.0, null, null))); query(format("INSERT INTO %s VALUES( TINYINT '100', SMALLINT '334', INTEGER '445', BIGINT '556', REAL '667.340', DOUBLE '778.560', CAST(889.0 AS DECIMAL(10, 0)), CAST(1000.670 AS DECIMAL(10, 5)), TIMESTAMP '2015-05-10 12:45:31', DATE '2015-05-10', CAST('p2 varchar' AS VARCHAR), CAST('p2 varchar10' AS VARCHAR(10)), CAST('p2 char10' AS CHAR(10)), true, CAST('p2 binary' as VARBINARY), BIGINT '2', 'partition2')", tableName)); query(format("INSERT INTO %s VALUES( null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, BIGINT '2', 'partition2')", tableName)); assertThat(query(showStatsPartitionTwo)).containsOnly(ImmutableList.of( - row("c_tinyint", null, 1.0, 0.5, null, "99", "100", null), - row("c_smallint", null, 1.0, 0.5, null, "333", "334", null), - row("c_int", null, 1.0, 0.5, null, "444", "445", null), - row("c_bigint", null, 1.0, 
0.5, null, "555", "556", null), - row("c_float", null, 1.0, 0.5, null, "666.34", "667.34", null), - row("c_double", null, 1.0, 0.5, null, "777.56", "778.56", null), - row("c_decimal", null, 1.0, 0.5, null, "888.0", "889.0", null), - row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "1000.67", null), - row("c_timestamp", null, 1.0, 0.5, null, null, null, null), - row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-10", null), - row("c_string", 20.0, 1.0, 0.5, null, null, null, null), - row("c_varchar", 20.0, 1.0, 0.5, null, null, null, null), - row("c_char", 18.0, 1.0, 0.5, null, null, null, null), - row("c_boolean", null, 1.0, 0.5, null, null, null, null), - row("c_binary", 18.0, null, 0.5, null, null, null, null), - row("p_bigint", null, 1.0, 0.0, null, "2", "2", null), - row("p_varchar", 40.0, 1.0, 0.0, null, null, null, null), - row(null, null, null, null, 4.0, null, null, null))); + row("c_tinyint", null, 1.0, 0.5, null, "99", "100"), + row("c_smallint", null, 1.0, 0.5, null, "333", "334"), + row("c_int", null, 1.0, 0.5, null, "444", "445"), + row("c_bigint", null, 1.0, 0.5, null, "555", "556"), + row("c_float", null, 1.0, 0.5, null, "666.34", "667.34"), + row("c_double", null, 1.0, 0.5, null, "777.56", "778.56"), + row("c_decimal", null, 1.0, 0.5, null, "888.0", "889.0"), + row("c_decimal_w_params", null, 1.0, 0.5, null, "999.67", "1000.67"), + row("c_timestamp", null, 1.0, 0.5, null, null, null), + row("c_date", null, 1.0, 0.5, null, "2015-05-09", "2015-05-10"), + row("c_string", 20.0, 1.0, 0.5, null, null, null), + row("c_varchar", 20.0, 1.0, 0.5, null, null, null), + row("c_char", 18.0, 1.0, 0.5, null, null, null), + row("c_boolean", null, 1.0, 0.5, null, null, null), + row("c_binary", 18.0, null, 0.5, null, null, null), + row("p_bigint", null, 1.0, 0.0, null, "2", "2"), + row("p_varchar", 40.0, 1.0, 0.0, null, null, null), + row(null, null, null, null, 4.0, null, null))); } finally { query(format("DROP TABLE IF EXISTS %s", tableName)); diff 
--git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatisticType.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatisticType.java index 20b9261e7bc25..793c5acec9606 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatisticType.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatisticType.java @@ -21,8 +21,7 @@ public enum ColumnStatisticType NUMBER_OF_DISTINCT_VALUES("approx_distinct"), NUMBER_OF_NON_NULL_VALUES("count"), NUMBER_OF_TRUE_VALUES("count_if"), - TOTAL_SIZE_IN_BYTES("sum_data_size_for_stats"), - HISTOGRAM("tdigest_agg"); + TOTAL_SIZE_IN_BYTES("sum_data_size_for_stats"); private final String functionName; ColumnStatisticType(String functionName) diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index ec4ee420e46bd..8ae5cfc25ce41 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -23,15 +23,13 @@ public final class ColumnStatistics { - private static final ColumnStatistics EMPTY = new ColumnStatistics(Estimate.unknown(), Estimate.unknown(), Estimate.unknown(), Optional.empty(), Optional.empty()); + private static final ColumnStatistics EMPTY = new ColumnStatistics(Estimate.unknown(), Estimate.unknown(), Estimate.unknown(), Optional.empty()); private final Estimate nullsFraction; private final Estimate distinctValuesCount; private final Estimate dataSize; private final Optional range; - private final Optional histogram; - public static ColumnStatistics empty() { return EMPTY; @@ -41,8 +39,7 @@ public ColumnStatistics( Estimate nullsFraction, Estimate distinctValuesCount, Estimate dataSize, - Optional range, - Optional histogram) + Optional range) { this.nullsFraction = 
requireNonNull(nullsFraction, "nullsFraction is null"); if (!nullsFraction.isUnknown()) { @@ -59,7 +56,6 @@ public ColumnStatistics( throw new IllegalArgumentException(format("dataSize must be greater than or equal to 0: %s", dataSize.getValue())); } this.range = requireNonNull(range, "range is null"); - this.histogram = requireNonNull(histogram, "histogram is null"); } @JsonProperty @@ -86,12 +82,6 @@ public Optional getRange() return range; } - @JsonProperty - public Optional getHistogram() - { - return histogram; - } - @Override public boolean equals(Object o) { @@ -105,14 +95,13 @@ public boolean equals(Object o) return Objects.equals(nullsFraction, that.nullsFraction) && Objects.equals(distinctValuesCount, that.distinctValuesCount) && Objects.equals(dataSize, that.dataSize) && - Objects.equals(range, that.range) && - Objects.equals(histogram, that.histogram); + Objects.equals(range, that.range); } @Override public int hashCode() { - return Objects.hash(nullsFraction, distinctValuesCount, dataSize, range, histogram); + return Objects.hash(nullsFraction, distinctValuesCount, dataSize, range); } @Override @@ -123,7 +112,6 @@ public String toString() ", distinctValuesCount=" + distinctValuesCount + ", dataSize=" + dataSize + ", range=" + range + - ", histogram=" + histogram + '}'; } @@ -136,8 +124,7 @@ public static Builder builder() * If one of the estimates below is unspecified, the default "unknown" estimate value * (represented by floating point NaN) may cause the resulting symbol statistics * to be "unknown" as well. 
- * - * @see VariableStatsEstimate + * @see SymbolStatsEstimate */ public static final class Builder { @@ -146,8 +133,6 @@ public static final class Builder private Estimate dataSize = Estimate.unknown(); private Optional range = Optional.empty(); - private Optional histogram = Optional.empty(); - public Builder setNullsFraction(Estimate nullsFraction) { this.nullsFraction = requireNonNull(nullsFraction, "nullsFraction is null"); @@ -193,40 +178,9 @@ public Builder setRange(Optional range) return this; } - public Builder setHistogram(Optional histogram) - { - this.histogram = histogram; - return this; - } - - public Builder mergeWith(Builder other) - { - if (nullsFraction.isUnknown()) { - this.nullsFraction = other.nullsFraction; - } - - if (distinctValuesCount.isUnknown()) { - this.distinctValuesCount = other.distinctValuesCount; - } - - if (dataSize.isUnknown()) { - this.dataSize = other.dataSize; - } - - if (!range.isPresent()) { - this.range = other.range; - } - - if (!histogram.isPresent()) { - this.histogram = other.histogram; - } - - return this; - } - public ColumnStatistics build() { - return new ColumnStatistics(nullsFraction, distinctValuesCount, dataSize, range, histogram); + return new ColumnStatistics(nullsFraction, distinctValuesCount, dataSize, range); } } } diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java deleted file mode 100644 index 0febeb7f0d3fa..0000000000000 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ConnectorHistogram.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.facebook.presto.spi.statistics; - -import com.fasterxml.jackson.annotation.JsonTypeInfo; - -/** - * This interface contains functions which the Presto optimizer can use to - * answer questions about a particular column's data distribution. These - * functions will be used to return answers to the query optimizer to build - * more realistic cost models for joins and filter predicates. - *
- * Currently, this interface supports representing histograms of columns whose - * domains map to real values. - *
- * Null values should not be represented in underlying histogram implementation. - * When calculating filter statistics using the {@link ColumnStatisticType#NUMBER_OF_NON_NULL_VALUES} - * are used to account for nulls in cost-based calculations. - * - * @see ColumnStatisticType#NUMBER_OF_NON_NULL_VALUES - */ -@JsonTypeInfo(use = JsonTypeInfo.Id.MINIMAL_CLASS, property = "@class") -public interface ConnectorHistogram -{ - /** - * Calculates an estimate for the percentile at which a particular value - * falls in a distribution. - *
- * Put another way, this function returns the value of F(x) where F(x) - * represents the CDF of this particular distribution. Traditionally, the - * true CDF of a random variable X is represented by F(x) = P(x <= X). This - * function signature allows for a slight modification by using the - * {@code inclusive} parameter to return the value for F(x) = P(x < X) - * should the underlying implementation support it. - * - * @param value the value to calculate percentile - * @param inclusive whether this calculation should be inclusive or exclusive of the value (<= or <) - * @return an {@link Estimate} of the percentile - */ - Estimate cumulativeProbability(double value, boolean inclusive); - - /** - * Calculates the value which occurs at a particular percentile in the given - * distribution. - *
- * Put another way, calculates the inverse CDF. Given F(x) is the CDF of - * a particular distribution, this function computes F^(-1)(x). - * - * @param percentile the percentile. Must be in the range [0.0, 1.0] - * @return the value in the distribution corresponding to the percentile - */ - Estimate inverseCumulativeProbability(double percentile); -} diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java index aee762133df63..6ae1257eaedd4 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/Estimate.java @@ -21,8 +21,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; import java.util.Objects; -import java.util.function.Function; -import java.util.function.Supplier; import static java.lang.Double.NaN; import static java.lang.Double.isInfinite; @@ -88,75 +86,6 @@ public double getValue() return value; } - /** - * If the estimate is not an unknown value, maps the current estimate using - * the given function. - * - * @param mapper mapping function - * @return a new estimate with the mapped value - */ - public Estimate map(Function mapper) - { - if (!isUnknown()) { - return Estimate.of(mapper.apply(value)); - } - return this; - } - - /** - * If the estimate is not unknown, maps the existing value where the mapping - * function should return a new estimate. 
- * - * @param mapper the mapping function - * @return a new estimate with the mapped value - */ - public Estimate flatMap(Function mapper) - { - if (!isUnknown()) { - return mapper.apply(value); - } - return this; - } - - /** - * If the estimate is unknown, run another function to generate an estimate - * - * @param supplier function to supply a new estimate - * @return a new estimate - */ - public Estimate or(Supplier supplier) - { - if (isUnknown()) { - return supplier.get(); - } - return this; - } - - /** - * If the estimate is unknown, run another function to generate an estimate - * - * @param supplier function to supply a new estimate - * @return a new estimate - */ - public double orElse(Supplier supplier) - { - if (isUnknown()) { - return supplier.get(); - } - return this.getValue(); - } - - public boolean fuzzyEquals(Estimate other, double tolerance) - { - if (equals(other)) { - return true; - } - if (isUnknown() || other.isUnknown()) { - return false; - } - return Math.copySign(value - other.value, 1.0) <= tolerance; - } - @Override public boolean equals(Object o) { diff --git a/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java b/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java index 730d87e30ce4e..a8a6d54296744 100644 --- a/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java +++ b/presto-tests/src/test/java/com/facebook/presto/tests/TestLocalQueries.java @@ -88,12 +88,12 @@ public void testShowColumnStats() MaterializedResult result = computeActual("SHOW STATS FOR nation"); MaterializedResult expectedStatistics = - resultBuilder(getSession(), VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR, VARCHAR) - .row("nationkey", null, 25.0, 0.0, null, "0", "24", null) - .row("name", 177.0, 25.0, 0.0, null, null, null, null) - .row("regionkey", null, 5.0, 0.0, null, "0", "4", null) - .row("comment", 1857.0, 25.0, 0.0, null, null, null, null) - .row(null, null, null, null, 25.0, 
null, null, null) + resultBuilder(getSession(), VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("nationkey", null, 25.0, 0.0, null, "0", "24") + .row("name", 177.0, 25.0, 0.0, null, null, null) + .row("regionkey", null, 5.0, 0.0, null, "0", "4") + .row("comment", 1857.0, 25.0, 0.0, null, null, null) + .row(null, null, null, null, 25.0, null, null) .build(); assertEquals(result, expectedStatistics); From f1507b68383c91343d607c1844fae483832860ba Mon Sep 17 00:00:00 2001 From: feilong-liu Date: Fri, 3 May 2024 10:33:41 -0700 Subject: [PATCH 2/4] Revert "Collect data size statistics for Iceberg tables" This reverts commit 457d81241134f69297bd9dfbb8c286a629b48dd9. --- .../src/main/sphinx/connector/iceberg.rst | 17 +- .../presto/iceberg/IcebergConfig.java | 17 +- .../presto/iceberg/IcebergHiveMetadata.java | 42 +-- .../iceberg/IcebergSessionProperties.java | 22 +- .../presto/iceberg/TableStatisticsMaker.java | 215 +++++--------- .../util/HiveStatisticsMergeStrategy.java | 37 +++ .../presto/iceberg/util/StatisticsUtil.java | 104 ++----- .../iceberg/IcebergDistributedTestBase.java | 105 ++----- .../presto/iceberg/TestIcebergConfig.java | 10 +- .../iceberg/TestIcebergTableChangelog.java | 4 +- .../presto/iceberg/TestStatisticsUtil.java | 274 ++++-------------- .../hive/TestIcebergDistributedHive.java | 5 +- .../hive/TestIcebergHiveStatistics.java | 74 +---- .../presto/sql/rewrite/ShowStatsRewrite.java | 10 +- .../spi/statistics/TableStatistics.java | 11 - 15 files changed, 267 insertions(+), 680 deletions(-) create mode 100644 presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/HiveStatisticsMergeStrategy.java diff --git a/presto-docs/src/main/sphinx/connector/iceberg.rst b/presto-docs/src/main/sphinx/connector/iceberg.rst index 67257daa5ddec..ec640a5f6ab84 100644 --- a/presto-docs/src/main/sphinx/connector/iceberg.rst +++ b/presto-docs/src/main/sphinx/connector/iceberg.rst @@ -225,13 +225,10 @@ Property Name Description 
``iceberg.enable-parquet-dereference-pushdown`` Enable parquet dereference pushdown. ``true`` -``iceberg.hive-statistics-merge-strategy`` Comma separated list of statistics to use from the - Hive Metastore to override Iceberg table statistics. - The available values are ``NUMBER_OF_DISTINCT_VALUES`` - and ``TOTAL_SIZE_IN_BYTES``. - - **Note**: Only valid when the Iceberg connector is - configured with Hive. +``iceberg.hive-statistics-merge-strategy`` Determines how to merge statistics that are stored in the ``NONE`` + Hive Metastore. The available values are ``NONE``, + ``USE_NULLS_FRACTION_AND_NDV``, ``USE_NULLS_FRACTIONS`` + and, ``USE_NDV`` ``iceberg.statistic-snapshot-record-difference-weight`` The amount that the difference in total record count matters when calculating the closest snapshot when picking @@ -309,8 +306,6 @@ Property Name Description ============================================= ====================================================================== ``iceberg.delete_as_join_rewrite_enabled`` Overrides the behavior of the connector property ``iceberg.delete-as-join-rewrite-enabled`` in the current session. -``iceberg.hive_statistics_merge_strategy`` Overrides the behavior of the connector property - ``iceberg.hive-statistics-merge-strategy`` in the current session. ============================================= ====================================================================== Caching Support @@ -1177,7 +1172,7 @@ each Iceberg data type to the corresponding Presto data type, and from each Pres The following tables detail the specific type maps between PrestoDB and Iceberg. Iceberg to PrestoDB type mapping -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Map of Iceberg types to the relevant PrestoDB types: @@ -1220,7 +1215,7 @@ Map of Iceberg types to the relevant PrestoDB types: No other types are supported. 
PrestoDB to Iceberg type mapping -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Map of PrestoDB types to the relevant Iceberg types: diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java index 9b370b4d3576c..8bf6686c9ec70 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java @@ -16,7 +16,7 @@ import com.facebook.airlift.configuration.Config; import com.facebook.airlift.configuration.ConfigDescription; import com.facebook.presto.hive.HiveCompressionCodec; -import com.facebook.presto.spi.statistics.ColumnStatisticType; +import com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy; import com.google.common.base.Splitter; import com.google.common.collect.ImmutableList; import org.apache.iceberg.hadoop.HadoopFileIO; @@ -26,13 +26,11 @@ import javax.validation.constraints.Min; import javax.validation.constraints.NotNull; -import java.util.EnumSet; import java.util.List; import static com.facebook.presto.hive.HiveCompressionCodec.GZIP; import static com.facebook.presto.iceberg.CatalogType.HIVE; import static com.facebook.presto.iceberg.IcebergFileFormat.PARQUET; -import static com.facebook.presto.iceberg.util.StatisticsUtil.decodeMergeFlags; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES_DEFAULT; @@ -53,13 +51,12 @@ public class IcebergConfig private boolean pushdownFilterEnabled; private boolean deleteAsJoinRewriteEnabled = true; - private EnumSet hiveStatisticsMergeFlags = EnumSet.noneOf(ColumnStatisticType.class); + private HiveStatisticsMergeStrategy hiveStatisticsMergeStrategy = 
HiveStatisticsMergeStrategy.NONE; private String fileIOImpl = HadoopFileIO.class.getName(); private boolean manifestCachingEnabled; private long maxManifestCacheSize = IO_MANIFEST_CACHE_MAX_TOTAL_BYTES_DEFAULT; private long manifestCacheExpireDuration = IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS_DEFAULT; private long manifestCacheMaxContentLength = IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH_DEFAULT; - @NotNull public FileFormat getFileFormat() { @@ -198,16 +195,16 @@ public boolean isMergeOnReadModeEnabled() } @Config("iceberg.hive-statistics-merge-strategy") - @ConfigDescription("Comma separated list of statistics to use from the Hive metastore to override iceberg table statistics") - public IcebergConfig setHiveStatisticsMergeFlags(String mergeFlags) + @ConfigDescription("determines how to merge statistics that are stored in the Hive Metastore") + public IcebergConfig setHiveStatisticsMergeStrategy(HiveStatisticsMergeStrategy mergeStrategy) { - this.hiveStatisticsMergeFlags = decodeMergeFlags(mergeFlags); + this.hiveStatisticsMergeStrategy = mergeStrategy; return this; } - public EnumSet getHiveStatisticsMergeFlags() + public HiveStatisticsMergeStrategy getHiveStatisticsMergeStrategy() { - return hiveStatisticsMergeFlags; + return hiveStatisticsMergeStrategy; } @Config("iceberg.statistic-snapshot-record-difference-weight") diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java index 35f0107a7aa61..113ccc931da7e 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java @@ -34,6 +34,7 @@ import com.facebook.presto.hive.metastore.PrestoTableType; import com.facebook.presto.hive.metastore.PrincipalPrivileges; import com.facebook.presto.hive.metastore.Table; +import com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy; 
import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.ConnectorNewTableLayout; import com.facebook.presto.spi.ConnectorOutputTableHandle; @@ -56,9 +57,9 @@ import com.facebook.presto.spi.relation.VariableReferenceExpression; import com.facebook.presto.spi.security.PrestoPrincipal; import com.facebook.presto.spi.statistics.ColumnStatisticMetadata; -import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.ComputedStatistics; +import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatisticType; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.spi.statistics.TableStatisticsMetadata; @@ -81,14 +82,12 @@ import java.io.IOException; import java.time.ZoneId; import java.util.Collection; -import java.util.EnumSet; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.TimeZone; import java.util.stream.Collectors; -import java.util.stream.Stream; import static com.facebook.presto.hive.HiveStatisticsUtil.createPartitionStatistics; import static com.facebook.presto.hive.HiveStatisticsUtil.updatePartitionStatistics; @@ -123,7 +122,6 @@ import static com.facebook.presto.iceberg.IcebergUtil.toHiveColumns; import static com.facebook.presto.iceberg.IcebergUtil.tryGetProperties; import static com.facebook.presto.iceberg.PartitionFields.parsePartitionFields; -import static com.facebook.presto.iceberg.util.StatisticsUtil.calculateAndSetTableSize; import static com.facebook.presto.iceberg.util.StatisticsUtil.mergeHiveStatistics; import static com.facebook.presto.spi.StandardErrorCode.INVALID_SCHEMA_PROPERTY; import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; @@ -444,7 +442,7 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab IcebergTableHandle handle = 
(IcebergTableHandle) tableHandle; org.apache.iceberg.Table icebergTable = getIcebergTable(session, handle.getSchemaTableName()); TableStatistics icebergStatistics = TableStatisticsMaker.getTableStatistics(session, typeManager, constraint, handle, icebergTable, columnHandles.stream().map(IcebergColumnHandle.class::cast).collect(Collectors.toList())); - EnumSet mergeFlags = getHiveStatisticsMergeStrategy(session); + HiveStatisticsMergeStrategy mergeStrategy = getHiveStatisticsMergeStrategy(session); return tableLayoutHandle.map(IcebergTableLayoutHandle.class::cast).map(layoutHandle -> { TupleDomain domainPredicate = layoutHandle.getDomainPredicate() .transform(subfield -> isEntireColumn(subfield) ? subfield.getRootName() : null) @@ -456,24 +454,24 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab return new VariableReferenceExpression(Optional.empty(), columnHandle.getName(), columnHandle.getType()); }); RowExpression translatedPredicate = rowExpressionService.getDomainTranslator().toPredicate(predicate); - TableStatistics mergedStatistics = Optional.of(mergeFlags) - .filter(set -> !set.isEmpty()) - .map(flags -> { - PartitionStatistics hiveStatistics = metastore.getTableStatistics(getMetastoreContext(session), handle.getSchemaName(), handle.getIcebergTableName().getTableName()); - return mergeHiveStatistics(icebergStatistics, hiveStatistics, mergeFlags, icebergTable.spec()); - }) - .orElse(icebergStatistics); + PartitionStatistics hiveStatistics = metastore.getTableStatistics(getMetastoreContext(session), handle.getSchemaName(), handle.getIcebergTableName().getTableName()); + TableStatistics mergedStatistics = mergeHiveStatistics(icebergStatistics, hiveStatistics, mergeStrategy, icebergTable.spec()); TableStatistics.Builder filteredStatsBuilder = TableStatistics.builder() .setRowCount(mergedStatistics.getRowCount()); + double totalSize = 0; for (ColumnHandle colHandle : columnHandles) { IcebergColumnHandle icebergHandle = 
(IcebergColumnHandle) colHandle; if (mergedStatistics.getColumnStatistics().containsKey(icebergHandle)) { ColumnStatistics stats = mergedStatistics.getColumnStatistics().get(icebergHandle); filteredStatsBuilder.setColumnStatistics(icebergHandle, stats); + if (!stats.getDataSize().isUnknown()) { + totalSize += stats.getDataSize().getValue(); + } } } + filteredStatsBuilder.setTotalSize(Estimate.of(totalSize)); return filterStatsCalculatorService.filterStats( - calculateAndSetTableSize(filteredStatsBuilder).build(), + filteredStatsBuilder.build(), translatedPredicate, session, columnHandles.stream().map(IcebergColumnHandle.class::cast).collect(toImmutableMap( @@ -483,9 +481,9 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab IcebergColumnHandle::getName, IcebergColumnHandle::getType))); }).orElseGet(() -> { - if (!mergeFlags.isEmpty()) { + if (!mergeStrategy.equals(HiveStatisticsMergeStrategy.NONE)) { PartitionStatistics hiveStats = metastore.getTableStatistics(getMetastoreContext(session), handle.getSchemaName(), handle.getIcebergTableName().getTableName()); - return mergeHiveStatistics(icebergStatistics, hiveStats, mergeFlags, icebergTable.spec()); + return mergeHiveStatistics(icebergStatistics, hiveStats, mergeStrategy, icebergTable.spec()); } return icebergStatistics; }); @@ -502,17 +500,9 @@ public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession { Set columnStatistics = tableMetadata.getColumns().stream() .filter(column -> !column.isHidden()) - .flatMap(meta -> { - try { - return metastore.getSupportedColumnStatistics(getMetastoreContext(session), meta.getType()) - .stream() - .map(statType -> statType.getColumnStatisticMetadata(meta.getName())); - } - // thrown in the case the type isn't supported by HMS statistics - catch (IllegalArgumentException e) { - return Stream.empty(); - } - }) + .flatMap(meta -> metastore.getSupportedColumnStatistics(getMetastoreContext(session), meta.getType()) + .stream() + 
.map(statType -> statType.getColumnStatisticMetadata(meta.getName()))) .collect(toImmutableSet()); Set tableStatistics = ImmutableSet.of(ROW_COUNT); diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java index 69061eedf99c8..fe4c51aa75492 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java @@ -18,10 +18,9 @@ import com.facebook.presto.hive.OrcFileWriterConfig; import com.facebook.presto.hive.ParquetFileWriterConfig; import com.facebook.presto.iceberg.nessie.NessieConfig; -import com.facebook.presto.iceberg.util.StatisticsUtil; +import com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy; import com.facebook.presto.spi.ConnectorSession; import com.facebook.presto.spi.session.PropertyMetadata; -import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; import io.airlift.units.DataSize; @@ -29,13 +28,10 @@ import javax.inject.Inject; -import java.util.EnumSet; import java.util.List; import static com.facebook.presto.common.type.VarcharType.VARCHAR; import static com.facebook.presto.common.type.VarcharType.createUnboundedVarcharType; -import static com.facebook.presto.iceberg.util.StatisticsUtil.SUPPORTED_MERGE_FLAGS; -import static com.facebook.presto.iceberg.util.StatisticsUtil.decodeMergeFlags; import static com.facebook.presto.spi.session.PropertyMetadata.booleanProperty; import static com.facebook.presto.spi.session.PropertyMetadata.doubleProperty; import static com.facebook.presto.spi.session.PropertyMetadata.integerProperty; @@ -160,14 +156,14 @@ public IcebergSessionProperties( false), new PropertyMetadata<>( HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, - "Flags to choose which statistics from the 
Hive Metastore are used when calculating table stats. Valid values are: " - + Joiner.on(", ").join(SUPPORTED_MERGE_FLAGS), + "choose how to include statistics from the Hive Metastore when calculating table stats. Valid values are: " + + Joiner.on(", ").join(HiveStatisticsMergeStrategy.values()), VARCHAR, - EnumSet.class, - icebergConfig.getHiveStatisticsMergeFlags(), + HiveStatisticsMergeStrategy.class, + icebergConfig.getHiveStatisticsMergeStrategy(), false, - val -> decodeMergeFlags((String) val), - StatisticsUtil::encodeMergeFlags), + val -> HiveStatisticsMergeStrategy.valueOf((String) val), + HiveStatisticsMergeStrategy::name), booleanProperty( PUSHDOWN_FILTER_ENABLED, "Experimental: Enable Filter Pushdown for Iceberg. This is only supported with Native Worker.", @@ -276,9 +272,9 @@ public static boolean isMergeOnReadModeEnabled(ConnectorSession session) return session.getProperty(MERGE_ON_READ_MODE_ENABLED, Boolean.class); } - public static EnumSet getHiveStatisticsMergeStrategy(ConnectorSession session) + public static HiveStatisticsMergeStrategy getHiveStatisticsMergeStrategy(ConnectorSession session) { - return session.getProperty(HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, EnumSet.class); + return session.getProperty(HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, HiveStatisticsMergeStrategy.class); } public static boolean isPushdownFilterEnabled(ConnectorSession session) diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java index 557addb1b4511..66c325a5df7db 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java @@ -14,7 +14,6 @@ package com.facebook.presto.iceberg; import com.facebook.airlift.log.Logger; -import com.facebook.presto.common.block.Block; import com.facebook.presto.common.predicate.TupleDomain; 
import com.facebook.presto.common.type.FixedWidthType; import com.facebook.presto.common.type.TypeManager; @@ -23,7 +22,6 @@ import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.statistics.ColumnStatisticMetadata; -import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.ComputedStatistics; import com.facebook.presto.spi.statistics.DoubleRange; @@ -46,17 +44,13 @@ import org.apache.iceberg.TableScan; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.puffin.Blob; -import org.apache.iceberg.puffin.BlobMetadata; import org.apache.iceberg.puffin.Puffin; -import org.apache.iceberg.puffin.PuffinReader; import org.apache.iceberg.puffin.PuffinWriter; import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; import java.io.IOException; import java.io.UncheckedIOException; @@ -74,7 +68,6 @@ import java.util.function.Predicate; import java.util.stream.Collectors; -import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.DateType.DATE; import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.common.type.TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE; @@ -87,9 +80,8 @@ import static com.facebook.presto.iceberg.IcebergSessionProperties.getStatisticSnapshotRecordDifferenceWeight; import static com.facebook.presto.iceberg.IcebergUtil.getIdentityPartitions; import static com.facebook.presto.iceberg.Partition.toMap; -import static com.facebook.presto.iceberg.util.StatisticsUtil.calculateAndSetTableSize; +import static 
com.facebook.presto.iceberg.TypeConverter.toPrestoType; import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.Iterables.getOnlyElement; import static java.lang.Long.parseLong; @@ -103,9 +95,7 @@ public class TableStatisticsMaker { private static final Logger log = Logger.get(TableStatisticsMaker.class); private static final String ICEBERG_THETA_SKETCH_BLOB_TYPE_ID = "apache-datasketches-theta-v1"; - private static final String ICEBERG_DATA_SIZE_BLOB_TYPE_ID = "presto-sum-data-size-bytes-v1"; private static final String ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY = "ndv"; - private static final String ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY = "data_size"; private final Table icebergTable; private final ConnectorSession session; private final TypeManager typeManager; @@ -117,16 +107,6 @@ private TableStatisticsMaker(Table icebergTable, ConnectorSession session, TypeM this.typeManager = typeManager; } - private static final Map puffinStatWriters = ImmutableMap.builder() - .put(NUMBER_OF_DISTINCT_VALUES, TableStatisticsMaker::generateNDVBlob) - .put(TOTAL_SIZE_IN_BYTES, TableStatisticsMaker::generateStatSizeBlob) - .build(); - - private static final Map puffinStatReaders = ImmutableMap.builder() - .put(ICEBERG_THETA_SKETCH_BLOB_TYPE_ID, TableStatisticsMaker::readNDVBlob) - .put(ICEBERG_DATA_SIZE_BLOB_TYPE_ID, TableStatisticsMaker::readDataSizeBlob) - .build(); - public static TableStatistics getTableStatistics(ConnectorSession session, TypeManager typeManager, Constraint constraint, IcebergTableHandle tableHandle, Table icebergTable, List columns) { return new TableStatisticsMaker(icebergTable, session, typeManager).makeTableStatistics(tableHandle, constraint, columns); @@ -186,13 +166,9 @@ private TableStatistics 
makeTableStatistics(IcebergTableHandle tableHandle, Cons double recordCount = summary.getRecordCount(); TableStatistics.Builder result = TableStatistics.builder(); result.setRowCount(Estimate.of(recordCount)); - + result.setTotalSize(Estimate.of(summary.getSize())); Map tableStats = getClosestStatisticsFileForSnapshot(tableHandle) - .map(this::loadStatisticsFile).orElseGet(Collections::emptyMap); - // scale all NDV values loaded from puffin files based on row count - totalRecordCount.ifPresent(fullTableRecordCount -> tableStats.forEach((id, stat) -> - stat.setDistinctValuesCount(stat.getDistinctValuesCount().map(value -> value * recordCount / fullTableRecordCount)))); - + .map(TableStatisticsMaker::loadStatisticsFile).orElseGet(Collections::emptyMap); for (IcebergColumnHandle columnHandle : selectedColumns) { int fieldId = columnHandle.getId(); ColumnStatistics.Builder columnBuilder = tableStats.getOrDefault(fieldId, ColumnStatistics.builder()); @@ -200,7 +176,12 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Cons if (nullCount != null) { columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount)); } - + if (summary.getColumnSizes() != null) { + Long columnSize = summary.getColumnSizes().get(fieldId); + if (columnSize != null) { + columnBuilder.setDataSize(Estimate.of(columnSize)); + } + } Object min = summary.getMinValues().get(fieldId); Object max = summary.getMaxValues().get(fieldId); if (min instanceof Number && max instanceof Number) { @@ -208,7 +189,7 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Cons } result.setColumnStatistics(columnHandle, columnBuilder.build()); } - return calculateAndSetTableSize(result).build(); + return result.build(); } private Partition getDataTableSummary(IcebergTableHandle tableHandle, @@ -262,6 +243,7 @@ private Partition getSummaryFromFiles(CloseableIterable> files, toMap(idToTypeMapping, contentFile.upperBounds()), contentFile.nullValueCounts(), new 
HashMap<>()); + updateColumnSizes(summary, contentFile.columnSizes()); } else { summary.incrementFileCount(); @@ -270,6 +252,7 @@ private Partition getSummaryFromFiles(CloseableIterable> files, updateSummaryMin(summary, partitionFields, toMap(idToTypeMapping, contentFile.lowerBounds()), contentFile.nullValueCounts(), contentFile.recordCount()); updateSummaryMax(summary, partitionFields, toMap(idToTypeMapping, contentFile.upperBounds()), contentFile.nullValueCounts(), contentFile.recordCount()); summary.updateNullCount(contentFile.nullValueCounts()); + updateColumnSizes(summary, contentFile.columnSizes()); } } } @@ -303,10 +286,24 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta .flatMap(map -> map.entrySet().stream()) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)) .forEach((key, value) -> { - Optional.ofNullable(puffinStatWriters.get(key.getStatisticType())) - .ifPresent(generator -> { - writer.add(generator.generate(key, value, icebergTable, snapshot)); - }); + if (!key.getStatisticType().equals(NUMBER_OF_DISTINCT_VALUES)) { + return; + } + Optional id = Optional.ofNullable(icebergTable.schema().findField(key.getColumnName())).map(Types.NestedField::fieldId); + if (!id.isPresent()) { + log.warn("failed to find column name %s in schema of table %s when writing distinct value statistics", key.getColumnName(), icebergTable.name()); + throw new PrestoException(ICEBERG_INVALID_METADATA, format("failed to find column name %s in schema of table %s when writing distinct value statistics", key.getColumnName(), icebergTable.name())); + } + ByteBuffer raw = VARBINARY.getSlice(value, 0).toByteBuffer(); + CompactSketch sketch = CompactSketch.wrap(Memory.wrap(raw, ByteOrder.nativeOrder())); + writer.add(new Blob( + ICEBERG_THETA_SKETCH_BLOB_TYPE_ID, + ImmutableList.of(id.get()), + snapshot.snapshotId(), + snapshot.sequenceNumber(), + raw, + null, + ImmutableMap.of(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY, 
Long.toString((long) sketch.getEstimate())))); }); writer.finish(); icebergTable.updateStatistics().setStatistics( @@ -328,87 +325,27 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta } } - @FunctionalInterface - private interface PuffinBlobGenerator - { - Blob generate(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot); - } - - @FunctionalInterface - private interface PuffinBlobReader - { - /** - * Reads the stats from the blob and then updates the stats builder argument. - */ - void read(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder stats); - } - - private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) - { - int id = getFieldId(metadata, icebergTable); - ByteBuffer raw = VARBINARY.getSlice(value, 0).toByteBuffer(); - CompactSketch sketch = CompactSketch.wrap(Memory.wrap(raw, ByteOrder.nativeOrder())); - return new Blob( - ICEBERG_THETA_SKETCH_BLOB_TYPE_ID, - ImmutableList.of(id), - snapshot.snapshotId(), - snapshot.sequenceNumber(), - raw, - null, - ImmutableMap.of(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY, Long.toString((long) sketch.getEstimate()))); - } - - private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) - { - int id = getFieldId(metadata, icebergTable); - long size = BIGINT.getLong(value, 0); - return new Blob( - ICEBERG_DATA_SIZE_BLOB_TYPE_ID, - ImmutableList.of(id), - snapshot.snapshotId(), - snapshot.sequenceNumber(), - ByteBuffer.allocate(0), // empty bytebuffer since the value is just stored on the blob properties - null, - ImmutableMap.of(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY, Long.toString(size))); - } - - private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) + public void updateColumnSizes(Partition summary, Map addedColumnSizes) { - 
Optional.ofNullable(metadata.properties().get(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY)) - .ifPresent(ndvProp -> { - try { - long ndv = parseLong(ndvProp); - statistics.setDistinctValuesCount(Estimate.of(ndv)); - } - catch (NumberFormatException e) { - statistics.setDistinctValuesCount(Estimate.unknown()); - log.warn("bad long value when parsing NDVs for statistics file blob %s. bad value: %d", metadata.type(), ndvProp); - } - }); - } - - private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) - { - Optional.ofNullable(metadata.properties().get(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY)) - .ifPresent(sizeProp -> { - try { - long size = parseLong(sizeProp); - statistics.setDataSize(Estimate.of(size)); - } - catch (NumberFormatException e) { - statistics.setDataSize(Estimate.unknown()); - log.warn("bad long value when parsing data size from statistics file blob %s. bad value: %d", metadata.type(), sizeProp); - } - }); - } - - private static int getFieldId(ColumnStatisticMetadata metadata, Table icebergTable) - { - return Optional.ofNullable(icebergTable.schema().findField(metadata.getColumnName())).map(Types.NestedField::fieldId) - .orElseThrow(() -> { - log.warn("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name()); - return new PrestoException(ICEBERG_INVALID_METADATA, format("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name())); - }); + Map columnSizes = summary.getColumnSizes(); + if (!summary.hasValidColumnMetrics() || columnSizes == null || addedColumnSizes == null) { + return; + } + for (Types.NestedField column : summary.getNonPartitionPrimitiveColumns()) { + int id = column.fieldId(); + com.facebook.presto.common.type.Type type = toPrestoType(column.type(), typeManager); + // allow the optimizer to infer the size of fixed-width types + // since it can be calculated accurately without collecting stats. 
+ if (type instanceof FixedWidthType) { + continue; + } + columnSizes.compute(id, (key, value) -> { + if (value == null) { + value = 0L; + } + return value + addedColumnSizes.getOrDefault(id, 0L); + }); + } } private void updateSummaryMin(Partition summary, List partitionFields, Map lowerBounds, Map nullCounts, long recordCount) @@ -491,35 +428,29 @@ private Optional getClosestStatisticsFileForSnapshot(IcebergTabl /** * Builds a map of field ID to ColumnStatistics for a particular {@link StatisticsFile}. + * + * @return */ - private Map loadStatisticsFile(StatisticsFile file) + private static Map loadStatisticsFile(StatisticsFile file) { - Map result = new HashMap<>(); - try (FileIO io = icebergTable.io()) { - InputFile inputFile = io.newInputFile(file.path()); - try (PuffinReader reader = Puffin.read(inputFile).build()) { - for (Pair data : reader.readAll(reader.fileMetadata().blobs())) { - BlobMetadata metadata = data.first(); - ByteBuffer blob = data.second(); - Integer field = getOnlyElement(metadata.inputFields()); - Optional.ofNullable(puffinStatReaders.get(metadata.type())) - .ifPresent(statReader -> { - result.compute(field, (key, value) -> { - if (value == null) { - value = ColumnStatistics.builder(); - } - statReader.read(metadata, blob, value); - return value; - }); - }); - } - } - catch (IOException e) { - throw new PrestoException(ICEBERG_FILESYSTEM_ERROR, "failed to read statistics file at " + file.path(), e); - } - } - - return ImmutableMap.copyOf(result); + ImmutableMap.Builder result = ImmutableMap.builder(); + file.blobMetadata().forEach(blob -> { + Integer field = getOnlyElement(blob.fields()); + ColumnStatistics.Builder colStats = ColumnStatistics.builder(); + Optional.ofNullable(blob.properties().get(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY)) + .ifPresent(ndvProp -> { + try { + long ndv = parseLong(ndvProp); + colStats.setDistinctValuesCount(Estimate.of(ndv)); + } + catch (NumberFormatException e) { + 
colStats.setDistinctValuesCount(Estimate.unknown()); + log.warn("bad long value when parsing statistics file %s, bad value: %d", file.path(), ndvProp); + } + }); + result.put(field, colStats); + }); + return result.build(); } public static List getSupportedColumnStatistics(String columnName, com.facebook.presto.common.type.Type type) @@ -532,10 +463,6 @@ public static List getSupportedColumnStatistics(String supportedStatistics.add(NUMBER_OF_DISTINCT_VALUES.getColumnStatisticMetadataWithCustomFunction(columnName, "sketch_theta")); } - if (!(type instanceof FixedWidthType)) { - supportedStatistics.add(TOTAL_SIZE_IN_BYTES.getColumnStatisticMetadata(columnName)); - } - return supportedStatistics.build(); } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/HiveStatisticsMergeStrategy.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/HiveStatisticsMergeStrategy.java new file mode 100644 index 0000000000000..419a04cad263f --- /dev/null +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/HiveStatisticsMergeStrategy.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.iceberg.util; + +/** + * strategies that define how to merge hive column statistics into Iceberg column statistics. 
+ */ +public enum HiveStatisticsMergeStrategy +{ + /** + * Do not merge statistics from Hive + */ + NONE, + /** + * Only merge NDV statistics from hive + */ + USE_NDV, + /** + * Only merge null fractions from hive + */ + USE_NULLS_FRACTIONS, + /** + * Merge both null fractions and NDVs from Hive + */ + USE_NULLS_FRACTION_AND_NDV, +} diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java index a6dcb93ad98c1..c64b1218fdfd7 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java @@ -13,29 +13,20 @@ */ package com.facebook.presto.iceberg.util; -import com.facebook.presto.common.type.FixedWidthType; import com.facebook.presto.hive.metastore.HiveColumnStatistics; import com.facebook.presto.hive.metastore.PartitionStatistics; import com.facebook.presto.iceberg.IcebergColumnHandle; -import com.facebook.presto.spi.PrestoException; -import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; -import com.google.common.base.Joiner; -import com.google.common.collect.ImmutableSet; import org.apache.iceberg.PartitionSpec; -import java.util.Arrays; -import java.util.EnumSet; import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import static com.facebook.presto.spi.StandardErrorCode.INVALID_ARGUMENTS; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.NONE; +import static 
com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.USE_NDV; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.USE_NULLS_FRACTIONS; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.USE_NULLS_FRACTION_AND_NDV; public final class StatisticsUtil { @@ -43,27 +34,15 @@ private StatisticsUtil() { } - public static final Set SUPPORTED_MERGE_FLAGS = ImmutableSet.builder() - .add(NUMBER_OF_DISTINCT_VALUES) - .add(TOTAL_SIZE_IN_BYTES) - .build(); - - /** - * Attempts to merge statistics from Iceberg and Hive tables. - *
- * Statistics from Hive are only merged if the corresponding flag exists in - * the {@code mergeFlags}. - */ - public static TableStatistics mergeHiveStatistics(TableStatistics icebergStatistics, PartitionStatistics hiveStatistics, EnumSet mergeFlags, PartitionSpec spec) + public static TableStatistics mergeHiveStatistics(TableStatistics icebergStatistics, PartitionStatistics hiveStatistics, HiveStatisticsMergeStrategy mergeStrategy, PartitionSpec spec) { - if (spec.isPartitioned()) { + if (mergeStrategy.equals(NONE) || spec.isPartitioned()) { return icebergStatistics; } - // We really only need to merge in NDVs and data size from the column statistics in hive's - // stats + // We really only need to merge in NDVs and null fractions from the column statistics in hive's stats // - // Always take iceberg row count, nulls, and min/max statistics over hive's as they are - // always up-to-date since they are computed and stored in manifests on writes + // take iceberg data and column size, and row count statistics over hive's as they're more likely + // to be up to date since they are computed on the fly Map columnStats = hiveStatistics.getColumnStatistics(); TableStatistics.Builder statsBuilder = TableStatistics.builder(); statsBuilder.setTotalSize(icebergStatistics.getTotalSize()); @@ -78,58 +57,29 @@ public static TableStatistics mergeHiveStatistics(TableStatistics icebergStatist .setDistinctValuesCount(icebergColumnStats.getDistinctValuesCount()) .setRange(icebergColumnStats.getRange()); if (hiveColumnStats != null) { - // NDVs - if (mergeFlags.contains(NUMBER_OF_DISTINCT_VALUES)) { + if (mergeStrategy.equals(USE_NDV) || mergeStrategy.equals(USE_NULLS_FRACTION_AND_NDV)) { hiveColumnStats.getDistinctValuesCount().ifPresent(ndvs -> mergedStats.setDistinctValuesCount(Estimate.of(ndvs))); } - // data size - if (mergeFlags.contains(ColumnStatisticType.TOTAL_SIZE_IN_BYTES)) { - hiveColumnStats.getTotalSizeInBytes().ifPresent(size -> 
mergedStats.setDataSize(Estimate.of(size))); + if (mergeStrategy.equals(USE_NULLS_FRACTIONS) || mergeStrategy.equals(USE_NULLS_FRACTION_AND_NDV)) { + hiveColumnStats.getNullsCount().ifPresent(nullCount -> { + Estimate nullsFraction; + if (!hiveStatistics.getBasicStatistics().getRowCount().isPresent()) { + if (icebergStatistics.getRowCount().isUnknown()) { + nullsFraction = Estimate.unknown(); + } + else { + nullsFraction = Estimate.of((double) nullCount / icebergStatistics.getRowCount().getValue()); + } + } + else { + nullsFraction = Estimate.of((double) nullCount / hiveStatistics.getBasicStatistics().getRowCount().getAsLong()); + } + mergedStats.setNullsFraction(nullsFraction); + }); } } statsBuilder.setColumnStatistics(columnHandle, mergedStats.build()); }); - return calculateAndSetTableSize(statsBuilder).build(); - } - - public static EnumSet decodeMergeFlags(String input) - { - return Optional.of(Arrays.stream((input).trim().split(",")) - .filter(value -> !value.isEmpty()) - .map(ColumnStatisticType::valueOf) - .peek(statistic -> { - if (!SUPPORTED_MERGE_FLAGS.contains(statistic)) { - throw new PrestoException(INVALID_ARGUMENTS, "merge flags may only include " + SUPPORTED_MERGE_FLAGS); - } - }) - .collect(Collectors.toSet())) - .filter(set -> !set.isEmpty()) - .map(EnumSet::copyOf) - .orElse(EnumSet.noneOf(ColumnStatisticType.class)); - } - - public static String encodeMergeFlags(EnumSet flags) - { - return Joiner.on(",").join(flags.stream().peek(statistic -> { - if (!SUPPORTED_MERGE_FLAGS.contains(statistic)) { - throw new PrestoException(INVALID_ARGUMENTS, "merge flags may only include " + SUPPORTED_MERGE_FLAGS); - } - }).map(Enum::name).iterator()); - } - - public static TableStatistics.Builder calculateAndSetTableSize(TableStatistics.Builder builder) - { - return builder.setTotalSize(builder.getRowCount().flatMap(rowCount -> builder.getColumnStatistics().entrySet().stream().map(entry -> { - IcebergColumnHandle columnHandle = (IcebergColumnHandle) 
entry.getKey(); - ColumnStatistics stats = entry.getValue(); - return stats.getDataSize().or(() -> { - if (columnHandle.getType() instanceof FixedWidthType) { - return stats.getNullsFraction().map(nulls -> rowCount * (1 - nulls) * ((FixedWidthType) columnHandle.getType()).getFixedSize()); - } - else { - return Estimate.unknown(); - } - }); - }).reduce(Estimate.of(0.0), (currentSize, newSize) -> currentSize.flatMap(current -> newSize.map(add -> current + add))))); + return statsBuilder.build(); } } diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java index 47a9af0243864..258bfefc16065 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java @@ -31,7 +31,6 @@ import com.facebook.presto.hive.authentication.NoHdfsAuthentication; import com.facebook.presto.iceberg.delete.DeleteFile; import com.facebook.presto.metadata.Metadata; -import com.facebook.presto.spi.ColumnHandle; import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.TableHandle; @@ -635,92 +634,49 @@ public void testStringFilters() } @Test - public void testReadWriteStats() + public void testReadWriteNDVs() { - assertUpdate("CREATE TABLE test_stats (col0 int, col1 varchar)"); - assertTrue(getQueryRunner().tableExists(getSession(), "test_stats")); - assertTableColumnNames("test_stats", "col0", "col1"); + assertUpdate("CREATE TABLE test_stat_ndv (col0 int)"); + assertTrue(getQueryRunner().tableExists(getSession(), "test_stat_ndv")); + assertTableColumnNames("test_stat_ndv", "col0"); // test that stats don't exist before analyze - Function, Map> remapper = (input) -> input.entrySet().stream().collect(Collectors.toMap(e -> ((IcebergColumnHandle) e.getKey()).getName(), 
Map.Entry::getValue)); - Map columnStats; - TableStatistics stats = getTableStats("test_stats"); - columnStats = remapper.apply(stats.getColumnStatistics()); - assertTrue(columnStats.isEmpty()); + TableStatistics stats = getTableStats("test_stat_ndv"); + assertTrue(stats.getColumnStatistics().isEmpty()); // test after simple insert we get a good estimate - assertUpdate("INSERT INTO test_stats VALUES (1, 'abc'), (2, 'xyz'), (3, 'lmnopqrst')", 3); - getQueryRunner().execute("ANALYZE test_stats"); - stats = getTableStats("test_stats"); - columnStats = remapper.apply(stats.getColumnStatistics()); - ColumnStatistics columnStat = columnStats.get("col0"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - assertEquals(columnStat.getDataSize(), Estimate.unknown()); - columnStat = columnStats.get("col1"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - double dataSize = (double) (long) getQueryRunner().execute("SELECT sum_data_size_for_stats(col1) FROM test_stats").getOnlyValue(); - assertEquals(columnStat.getDataSize().getValue(), dataSize); + assertUpdate("INSERT INTO test_stat_ndv VALUES 1, 2, 3", 3); + getQueryRunner().execute("ANALYZE test_stat_ndv"); + stats = getTableStats("test_stat_ndv"); + assertEquals(stats.getColumnStatistics().values().stream().findFirst().get().getDistinctValuesCount(), Estimate.of(3.0)); // test after inserting the same values, we still get the same estimate - assertUpdate("INSERT INTO test_stats VALUES (1, 'abc'), (2, 'xyz'), (3, 'lmnopqrst')", 3); - stats = getTableStats("test_stats"); - columnStats = remapper.apply(stats.getColumnStatistics()); - columnStat = columnStats.get("col0"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - assertEquals(columnStat.getDataSize(), Estimate.unknown()); - columnStat = columnStats.get("col1"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - assertEquals(columnStat.getDataSize().getValue(), dataSize); - - // 
test after ANALYZING with the new inserts that the NDV estimate is the same and the data size matches - getQueryRunner().execute("ANALYZE test_stats"); - stats = getTableStats("test_stats"); - columnStats = remapper.apply(stats.getColumnStatistics()); - columnStat = columnStats.get("col0"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - assertEquals(columnStat.getDataSize(), Estimate.unknown()); - columnStat = columnStats.get("col1"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - dataSize = (double) (long) getQueryRunner().execute("SELECT sum_data_size_for_stats(col1) FROM test_stats").getOnlyValue(); - assertEquals(columnStat.getDataSize().getValue(), dataSize); + assertUpdate("INSERT INTO test_stat_ndv VALUES 1, 2, 3", 3); + stats = getTableStats("test_stat_ndv"); + assertEquals(stats.getColumnStatistics().values().stream().findFirst().get().getDistinctValuesCount(), Estimate.of(3.0)); + + // test after ANALYZING with the new inserts that the NDV estimate is the same + getQueryRunner().execute("ANALYZE test_stat_ndv"); + stats = getTableStats("test_stat_ndv"); + assertEquals(stats.getColumnStatistics().values().stream().findFirst().get().getDistinctValuesCount(), Estimate.of(3.0)); // test after inserting a new value, but not analyzing, the estimate is the same. 
- assertUpdate("INSERT INTO test_stats VALUES (4, 'def')", 1); - stats = getTableStats("test_stats"); - columnStats = remapper.apply(stats.getColumnStatistics()); - columnStat = columnStats.get("col0"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - assertEquals(columnStat.getDataSize(), Estimate.unknown()); - columnStat = columnStats.get("col1"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(3.0)); - assertEquals(columnStat.getDataSize().getValue(), dataSize); + assertUpdate("INSERT INTO test_stat_ndv VALUES 4", 1); + stats = getTableStats("test_stat_ndv"); + assertEquals(stats.getColumnStatistics().values().stream().findFirst().get().getDistinctValuesCount(), Estimate.of(3.0)); // test that after analyzing, the updates stats show up. - getQueryRunner().execute("ANALYZE test_stats"); - stats = getTableStats("test_stats"); - columnStats = remapper.apply(stats.getColumnStatistics()); - columnStat = columnStats.get("col0"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(4.0)); - assertEquals(columnStat.getDataSize(), Estimate.unknown()); - columnStat = columnStats.get("col1"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(4.0)); - dataSize = (double) (long) getQueryRunner().execute("SELECT sum_data_size_for_stats(col1) FROM test_stats").getOnlyValue(); - assertEquals(columnStat.getDataSize().getValue(), dataSize); + getQueryRunner().execute("ANALYZE test_stat_ndv"); + stats = getTableStats("test_stat_ndv"); + assertEquals(stats.getColumnStatistics().values().stream().findFirst().get().getDistinctValuesCount(), Estimate.of(4.0)); // test adding a null value is successful, and analyze still runs successfully - assertUpdate("INSERT INTO test_stats VALUES (NULL, NULL)", 1); - assertQuerySucceeds("ANALYZE test_stats"); - stats = getTableStats("test_stats"); - columnStats = remapper.apply(stats.getColumnStatistics()); - columnStat = columnStats.get("col0"); - 
assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(4.0)); - assertEquals(columnStat.getDataSize(), Estimate.unknown()); - columnStat = columnStats.get("col1"); - assertEquals(columnStat.getDistinctValuesCount(), Estimate.of(4.0)); - dataSize = (double) (long) getQueryRunner().execute("SELECT sum_data_size_for_stats(col1) FROM test_stats").getOnlyValue(); - assertEquals(columnStat.getDataSize().getValue(), dataSize); - - assertUpdate("DROP TABLE test_stats"); + assertUpdate("INSERT INTO test_stat_ndv VALUES NULL", 1); + assertQuerySucceeds("ANALYZE test_stat_ndv"); + stats = getTableStats("test_stat_ndv"); + assertEquals(stats.getColumnStatistics().values().stream().findFirst().get().getDistinctValuesCount(), Estimate.of(4.0)); + + assertUpdate("DROP TABLE test_stat_ndv"); } @Test @@ -862,7 +818,6 @@ public void testStatsDataSizePrimitives() { assertUpdate("CREATE TABLE test_stat_data_size (c0 int, c1 bigint, c2 double, c3 decimal(4, 0), c4 varchar, c5 varchar(10), c6 date, c7 time, c8 timestamp, c10 boolean)"); assertUpdate("INSERT INTO test_stat_data_size VALUES (0, 1, 2.0, CAST(4.01 as decimal(4, 0)), 'testvc', 'testvc10', date '2024-03-14', localtime, localtimestamp, TRUE)", 1); - assertQuerySucceeds("ANALYZE test_stat_data_size"); TableStatistics stats = getTableStats("test_stat_data_size"); stats.getColumnStatistics().entrySet().stream() .filter((e) -> ((IcebergColumnHandle) e.getKey()).getColumnType() != SYNTHESIZED) diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java index 4b0e8979887b6..7d84025f72ba5 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java @@ -13,6 +13,7 @@ */ package com.facebook.presto.iceberg; +import com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy; import 
com.google.common.collect.ImmutableMap; import org.apache.iceberg.hadoop.HadoopFileIO; import org.testng.annotations.Test; @@ -28,8 +29,7 @@ import static com.facebook.presto.iceberg.CatalogType.HIVE; import static com.facebook.presto.iceberg.IcebergFileFormat.ORC; import static com.facebook.presto.iceberg.IcebergFileFormat.PARQUET; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.USE_NDV; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_EXPIRATION_INTERVAL_MS_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_CONTENT_LENGTH_DEFAULT; import static org.apache.iceberg.CatalogProperties.IO_MANIFEST_CACHE_MAX_TOTAL_BYTES_DEFAULT; @@ -46,7 +46,7 @@ public void testDefaults() .setCatalogWarehouse(null) .setCatalogCacheSize(10) .setHadoopConfigResources(null) - .setHiveStatisticsMergeFlags("") + .setHiveStatisticsMergeStrategy(HiveStatisticsMergeStrategy.NONE) .setStatisticSnapshotRecordDifferenceWeight(0.0) .setMaxPartitionsPerWriter(100) .setMinimumAssignedSplitWeight(0.05) @@ -76,7 +76,7 @@ public void testExplicitPropertyMappings() .put("iceberg.enable-parquet-dereference-pushdown", "false") .put("iceberg.enable-merge-on-read-mode", "false") .put("iceberg.statistic-snapshot-record-difference-weight", "1.0") - .put("iceberg.hive-statistics-merge-strategy", NUMBER_OF_DISTINCT_VALUES.name() + "," + TOTAL_SIZE_IN_BYTES.name()) + .put("iceberg.hive-statistics-merge-strategy", "USE_NDV") .put("iceberg.pushdown-filter-enabled", "true") .put("iceberg.delete-as-join-rewrite-enabled", "false") .put("iceberg.io.manifest.cache-enabled", "true") @@ -98,7 +98,7 @@ public void testExplicitPropertyMappings() .setStatisticSnapshotRecordDifferenceWeight(1.0) .setParquetDereferencePushdownEnabled(false) 
.setMergeOnReadModeEnabled(false) - .setHiveStatisticsMergeFlags("NUMBER_OF_DISTINCT_VALUES,TOTAL_SIZE_IN_BYTES") + .setHiveStatisticsMergeStrategy(USE_NDV) .setPushdownFilterEnabled(true) .setDeleteAsJoinRewriteEnabled(false) .setManifestCachingEnabled(true) diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergTableChangelog.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergTableChangelog.java index 9adce1fe7deb0..e5c5a65a86e02 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergTableChangelog.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergTableChangelog.java @@ -22,6 +22,7 @@ import org.testng.annotations.Test; import java.util.Arrays; +import java.util.Map; import java.util.stream.Collectors; import static com.facebook.presto.iceberg.IcebergQueryRunner.createIcebergQueryRunner; @@ -33,7 +34,8 @@ public class TestIcebergTableChangelog protected QueryRunner createQueryRunner() throws Exception { - return createIcebergQueryRunner(ImmutableMap.of(), CatalogType.HADOOP); + Map properties = ImmutableMap.of("http-server.http.port", "8080"); + return createIcebergQueryRunner(properties, CatalogType.HADOOP); } private long[] snapshots = new long[0]; diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestStatisticsUtil.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestStatisticsUtil.java index c55fc147f392f..6d412d20bcda2 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestStatisticsUtil.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestStatisticsUtil.java @@ -13,13 +13,10 @@ */ package com.facebook.presto.iceberg; -import com.facebook.presto.common.type.Type; import com.facebook.presto.hive.HiveBasicStatistics; import com.facebook.presto.hive.metastore.HiveColumnStatistics; import com.facebook.presto.hive.metastore.PartitionStatistics; import 
com.facebook.presto.iceberg.ColumnIdentity.TypeCategory; -import com.facebook.presto.spi.PrestoException; -import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; @@ -31,226 +28,89 @@ import org.testng.annotations.Test; import java.util.Collections; -import java.util.EnumSet; -import java.util.Map; import java.util.Optional; import java.util.OptionalLong; -import java.util.stream.Collectors; import static com.facebook.presto.common.type.IntegerType.INTEGER; -import static com.facebook.presto.common.type.VarcharType.VARCHAR; import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.REGULAR; -import static com.facebook.presto.iceberg.util.StatisticsUtil.SUPPORTED_MERGE_FLAGS; -import static com.facebook.presto.iceberg.util.StatisticsUtil.calculateAndSetTableSize; -import static com.facebook.presto.iceberg.util.StatisticsUtil.decodeMergeFlags; -import static com.facebook.presto.iceberg.util.StatisticsUtil.encodeMergeFlags; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.NONE; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.USE_NDV; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.USE_NULLS_FRACTIONS; +import static com.facebook.presto.iceberg.util.HiveStatisticsMergeStrategy.USE_NULLS_FRACTION_AND_NDV; import static com.facebook.presto.iceberg.util.StatisticsUtil.mergeHiveStatistics; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_NON_NULL_VALUES; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertThrows; -import static org.testng.Assert.assertTrue; 
public class TestStatisticsUtil { @Test public void testMergeStrategyNone() { - TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), EnumSet.noneOf(ColumnStatisticType.class), PartitionSpec.unpartitioned()); - assertEquals(merged.getRowCount(), Estimate.of(1)); - assertEquals(merged.getTotalSize(), Estimate.of(16)); - assertEquals(merged.getColumnStatistics().size(), 2); - Map columnStats = mapToString(merged); - ColumnStatistics stats = columnStats.get("testint"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(1)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.1)); - assertEquals(stats.getRange().get(), new DoubleRange(0.0, 1.0)); - stats = columnStats.get("testvarchar"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(1)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.25)); + TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), NONE, PartitionSpec.unpartitioned()); + assertEquals(Estimate.of(1), merged.getRowCount()); + assertEquals(Estimate.unknown(), merged.getTotalSize()); + assertEquals(1, merged.getColumnStatistics().size()); + ColumnStatistics stats = merged.getColumnStatistics().values().stream().findFirst().get(); + assertEquals(Estimate.of(8), stats.getDataSize()); + assertEquals(Estimate.of(1), stats.getDistinctValuesCount()); + assertEquals(Estimate.of(0.1), stats.getNullsFraction()); + assertEquals(new DoubleRange(0.0, 1.0), stats.getRange().get()); } @Test public void testMergeStrategyWithPartitioned() { - TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), EnumSet.of(NUMBER_OF_DISTINCT_VALUES, NUMBER_OF_NON_NULL_VALUES), + TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), 
generateSingleColumnHiveStatistics(), USE_NULLS_FRACTION_AND_NDV, PartitionSpec.builderFor(new Schema(Types.NestedField.required(0, "test", Types.IntegerType.get()))).bucket("test", 100).build()); - assertEquals(merged.getRowCount(), Estimate.of(1)); - assertEquals(merged.getTotalSize(), Estimate.unknown()); - assertEquals(merged.getColumnStatistics().size(), 2); - Map columnStats = mapToString(merged); - ColumnStatistics stats = columnStats.get("testint"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(1)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.1)); - assertEquals(stats.getRange().get(), new DoubleRange(0.0, 1.0)); - stats = columnStats.get("testvarchar"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(1)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.25)); + assertEquals(Estimate.of(1), merged.getRowCount()); + assertEquals(Estimate.unknown(), merged.getTotalSize()); + assertEquals(1, merged.getColumnStatistics().size()); + ColumnStatistics stats = merged.getColumnStatistics().values().stream().findFirst().get(); + assertEquals(Estimate.of(8), stats.getDataSize()); + assertEquals(Estimate.of(1), stats.getDistinctValuesCount()); + assertEquals(Estimate.of(0.1), stats.getNullsFraction()); + assertEquals(new DoubleRange(0.0, 1.0), stats.getRange().get()); } @Test public void testMergeStrategyNDVs() { - TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), EnumSet.of(NUMBER_OF_DISTINCT_VALUES), PartitionSpec.unpartitioned()); - assertEquals(merged.getRowCount(), Estimate.of(1)); - assertEquals(merged.getTotalSize(), Estimate.of(16)); - assertEquals(merged.getColumnStatistics().size(), 2); - Map columnStats = mapToString(merged); - ColumnStatistics stats = columnStats.get("testint"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - 
assertEquals(stats.getDistinctValuesCount(), Estimate.of(2)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.1)); - assertEquals(stats.getRange().get(), new DoubleRange(0.0, 1.0)); - stats = columnStats.get("testvarchar"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(2)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.25)); + TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), USE_NDV, PartitionSpec.unpartitioned()); + assertEquals(Estimate.of(1), merged.getRowCount()); + assertEquals(Estimate.unknown(), merged.getTotalSize()); + assertEquals(1, merged.getColumnStatistics().size()); + ColumnStatistics stats = merged.getColumnStatistics().values().stream().findFirst().get(); + assertEquals(Estimate.of(8), stats.getDataSize()); + assertEquals(Estimate.of(2), stats.getDistinctValuesCount()); + assertEquals(Estimate.of(0.1), stats.getNullsFraction()); + assertEquals(new DoubleRange(0.0, 1.0), stats.getRange().get()); } @Test - public void testMergeStrategyDataSize() + public void testMergeStrategyNulls() { - TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), EnumSet.of(TOTAL_SIZE_IN_BYTES), PartitionSpec.unpartitioned()); - assertEquals(merged.getRowCount(), Estimate.of(1)); - assertEquals(merged.getTotalSize(), Estimate.of(22)); - assertEquals(merged.getColumnStatistics().size(), 2); - Map columnStats = mapToString(merged); - ColumnStatistics stats = columnStats.get("testint"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(1)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.1)); - assertEquals(stats.getRange().get(), new DoubleRange(0.0, 1.0)); - stats = columnStats.get("testvarchar"); - assertEquals(stats.getDataSize(), Estimate.of(14)); - 
assertEquals(stats.getDistinctValuesCount(), Estimate.of(1)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.25)); + TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), USE_NULLS_FRACTIONS, PartitionSpec.unpartitioned()); + assertEquals(Estimate.of(1), merged.getRowCount()); + assertEquals(Estimate.unknown(), merged.getTotalSize()); + assertEquals(1, merged.getColumnStatistics().size()); + ColumnStatistics stats = merged.getColumnStatistics().values().stream().findFirst().get(); + assertEquals(Estimate.of(8), stats.getDataSize()); + assertEquals(Estimate.of(1), stats.getDistinctValuesCount()); + assertEquals(Estimate.of(1.0), stats.getNullsFraction()); + assertEquals(new DoubleRange(0.0, 1.0), stats.getRange().get()); } @Test public void testMergeStrategyNDVsAndNulls() { - TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), EnumSet.of(NUMBER_OF_DISTINCT_VALUES, TOTAL_SIZE_IN_BYTES), PartitionSpec.unpartitioned()); - assertEquals(merged.getRowCount(), Estimate.of(1)); - assertEquals(merged.getTotalSize(), Estimate.of(22)); - assertEquals(merged.getColumnStatistics().size(), 2); - Map columnStats = mapToString(merged); - ColumnStatistics stats = columnStats.get("testint"); - assertEquals(stats.getDataSize(), Estimate.of(8)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(2)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.1)); - assertEquals(stats.getRange().get(), new DoubleRange(0.0, 1.0)); - stats = columnStats.get("testvarchar"); - assertEquals(stats.getDataSize(), Estimate.of(14)); - assertEquals(stats.getDistinctValuesCount(), Estimate.of(2)); - assertEquals(stats.getNullsFraction(), Estimate.of(0.25)); - } - - @Test - public void testEncodeDecode() - { - assertTrue(decodeMergeFlags("").isEmpty()); - assertEquals(decodeMergeFlags(NUMBER_OF_DISTINCT_VALUES.name()), 
EnumSet.of(NUMBER_OF_DISTINCT_VALUES)); - assertEquals(decodeMergeFlags(NUMBER_OF_DISTINCT_VALUES + "," + TOTAL_SIZE_IN_BYTES), - EnumSet.of(NUMBER_OF_DISTINCT_VALUES, TOTAL_SIZE_IN_BYTES)); - assertEquals(decodeMergeFlags(NUMBER_OF_DISTINCT_VALUES + "," + TOTAL_SIZE_IN_BYTES + "," + TOTAL_SIZE_IN_BYTES), - EnumSet.of(NUMBER_OF_DISTINCT_VALUES, TOTAL_SIZE_IN_BYTES)); - - assertEquals(encodeMergeFlags(EnumSet.noneOf(ColumnStatisticType.class)), ""); - assertEquals(encodeMergeFlags(EnumSet.of(NUMBER_OF_DISTINCT_VALUES)), NUMBER_OF_DISTINCT_VALUES.name()); - assertEquals(encodeMergeFlags(EnumSet.of(NUMBER_OF_DISTINCT_VALUES, TOTAL_SIZE_IN_BYTES)), - NUMBER_OF_DISTINCT_VALUES.name() + "," + TOTAL_SIZE_IN_BYTES.name()); - assertEquals(encodeMergeFlags(EnumSet.of(NUMBER_OF_DISTINCT_VALUES, TOTAL_SIZE_IN_BYTES)), - NUMBER_OF_DISTINCT_VALUES.name() + "," + TOTAL_SIZE_IN_BYTES.name()); - - EnumSet invalidFlags = EnumSet.allOf(ColumnStatisticType.class); - SUPPORTED_MERGE_FLAGS.forEach(invalidFlags::remove); - // throw on encode - invalidFlags.forEach(flag -> assertThrows(PrestoException.class, () -> { - encodeMergeFlags(EnumSet.of(flag)); - })); - - // throw on decode - invalidFlags.forEach(flag -> assertThrows(PrestoException.class, () -> { - decodeMergeFlags(flag.toString()); - })); - } - - @Test - public void testCalculateTableSize() - { - // 1 row, but no column statistics - assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.of(1.0))).build().getTotalSize(), Estimate.of(0.0)); - // unknown rows - assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.unknown())).build().getTotalSize(), Estimate.unknown()); - // non-fixed width column with known data size - assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.of(100.0)) - .setColumnStatistics(handleWithName("c1", VARCHAR), ColumnStatistics.builder() - .setDataSize(Estimate.of(100_000)) - .build())) - .build() - 
.getTotalSize(), Estimate.of(100_000)); - // fixed-width column with some nulls - assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.of(100.0)) - .setColumnStatistics(handleWithName("c1", INTEGER), ColumnStatistics.builder() - .setNullsFraction(Estimate.of(0.2)) - .build())) - .build() - .getTotalSize(), Estimate.of(INTEGER.getFixedSize() * 100 * (1 - 0.2))); - // fixed-width column with all nulls - assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.of(100.0)) - .setColumnStatistics(handleWithName("c1", INTEGER), ColumnStatistics.builder() - .setNullsFraction(Estimate.of(1.0)) - .build())) - .build() - .getTotalSize(), Estimate.of(0.0)); - - // two columns which are fixed and non-fixed widths added together with some nulls - assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.of(100.0)) - .setColumnStatistics(handleWithName("c1", INTEGER), ColumnStatistics.builder() - .setNullsFraction(Estimate.of(0.2)) - .build()) - .setColumnStatistics(handleWithName("c2", VARCHAR), ColumnStatistics.builder() - .setDataSize(Estimate.of(12345)) - .setNullsFraction(Estimate.of(0.5)) - .build())) - .build() - .getTotalSize(), Estimate.of(12345 + (INTEGER.getFixedSize() * 100 * (1 - 0.2)))); - - // two columns which are fixed and non-fixed widths but null fraction is unknown on the fixed-width column - assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.of(100.0)) - .setColumnStatistics(handleWithName("c1", INTEGER), ColumnStatistics.builder() - .setNullsFraction(Estimate.unknown()) - .build()) - .setColumnStatistics(handleWithName("c2", VARCHAR), ColumnStatistics.builder() - .setDataSize(Estimate.of(12345)) - .setNullsFraction(Estimate.of(0.5)) - .build())) - .build() - .getTotalSize(), Estimate.unknown()); - // two columns which are fixed and non-fixed widths but data size is unknown on non-fixed-width column - 
assertEquals(calculateAndSetTableSize(TableStatistics.builder() - .setRowCount(Estimate.of(100.0)) - .setColumnStatistics(handleWithName("c1", INTEGER), ColumnStatistics.builder() - .setNullsFraction(Estimate.of(0.5)) - .build()) - .setColumnStatistics(handleWithName("c2", VARCHAR), ColumnStatistics.builder() - .setDataSize(Estimate.unknown()) - .setNullsFraction(Estimate.of(0.5)) - .build())) - .build() - .getTotalSize(), Estimate.unknown()); + TableStatistics merged = mergeHiveStatistics(generateSingleColumnIcebergStats(), generateSingleColumnHiveStatistics(), USE_NULLS_FRACTION_AND_NDV, PartitionSpec.unpartitioned()); + assertEquals(Estimate.of(1), merged.getRowCount()); + assertEquals(Estimate.unknown(), merged.getTotalSize()); + assertEquals(1, merged.getColumnStatistics().size()); + ColumnStatistics stats = merged.getColumnStatistics().values().stream().findFirst().get(); + assertEquals(Estimate.of(8), stats.getDataSize()); + assertEquals(Estimate.of(2), stats.getDistinctValuesCount()); + assertEquals(Estimate.of(1.0), stats.getNullsFraction()); + assertEquals(new DoubleRange(0.0, 1.0), stats.getRange().get()); } private static TableStatistics generateSingleColumnIcebergStats() @@ -259,7 +119,7 @@ private static TableStatistics generateSingleColumnIcebergStats() .setRowCount(Estimate.of(1)) .setTotalSize(Estimate.unknown()) .setColumnStatistics(new IcebergColumnHandle( - new ColumnIdentity(1, "testint", TypeCategory.PRIMITIVE, Collections.emptyList()), + new ColumnIdentity(1, "test", TypeCategory.PRIMITIVE, Collections.emptyList()), INTEGER, Optional.empty(), REGULAR), @@ -269,48 +129,18 @@ private static TableStatistics generateSingleColumnIcebergStats() .setDataSize(Estimate.of(8)) .setDistinctValuesCount(Estimate.of(1)) .build()) - .setColumnStatistics(new IcebergColumnHandle( - new ColumnIdentity(1, "testvarchar", TypeCategory.PRIMITIVE, Collections.emptyList()), - VARCHAR, - Optional.empty(), - REGULAR), - ColumnStatistics.builder() - 
.setNullsFraction(Estimate.of(0.25)) - .setDataSize(Estimate.of(8)) - .setDistinctValuesCount(Estimate.of(1)) - .build()) .build(); } - private static IcebergColumnHandle handleWithName(String name, Type type) - { - return new IcebergColumnHandle( - new ColumnIdentity(1, name, TypeCategory.PRIMITIVE, Collections.emptyList()), - type, - Optional.empty(), - REGULAR); - } - private static PartitionStatistics generateSingleColumnHiveStatistics() { return new PartitionStatistics( new HiveBasicStatistics(1, 2, 16, 16), - ImmutableMap.of("testint", + ImmutableMap.of("test", HiveColumnStatistics.createIntegerColumnStatistics( OptionalLong.of(1), OptionalLong.of(2), OptionalLong.of(2), - OptionalLong.of(2)), - "testvarchar", - HiveColumnStatistics.createStringColumnStatistics( - OptionalLong.of(4L), - OptionalLong.of(14L), - OptionalLong.of(2), OptionalLong.of(2)))); } - - private static Map mapToString(TableStatistics statistics) - { - return statistics.getColumnStatistics().entrySet().stream().collect(Collectors.toMap(entry -> ((IcebergColumnHandle) entry.getKey()).getName(), Map.Entry::getValue)); - } } diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java index 039ef121ac4e2..8a7ef1a5fb560 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java @@ -20,7 +20,6 @@ import com.facebook.presto.metadata.CatalogManager; import com.facebook.presto.spi.ConnectorId; import com.facebook.presto.spi.SchemaTableName; -import com.google.common.base.Joiner; import com.google.common.collect.ImmutableMap; import org.apache.iceberg.Table; import org.testng.annotations.Test; @@ -28,8 +27,6 @@ import static com.facebook.presto.hive.metastore.InMemoryCachingHiveMetastore.memoizeMetastore; import 
static com.facebook.presto.iceberg.CatalogType.HIVE; import static com.facebook.presto.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; @Test public class TestIcebergDistributedHive @@ -37,7 +34,7 @@ public class TestIcebergDistributedHive { public TestIcebergDistributedHive() { - super(HIVE, ImmutableMap.of("iceberg.hive-statistics-merge-strategy", Joiner.on(",").join(NUMBER_OF_DISTINCT_VALUES.name(), TOTAL_SIZE_IN_BYTES.name()))); + super(HIVE, ImmutableMap.of("iceberg.hive-statistics-merge-strategy", "USE_NULLS_FRACTION_AND_NDV")); } @Override diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java index 8df787a8bac1c..c6c4d80896abc 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java @@ -29,7 +29,6 @@ import com.facebook.presto.spi.analyzer.MetadataResolver; import com.facebook.presto.spi.security.AllowAllAccessControl; import com.facebook.presto.spi.statistics.ColumnStatistics; -import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.testing.MaterializedResult; import com.facebook.presto.testing.QueryRunner; @@ -51,9 +50,6 @@ import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.PARTITION_KEY; import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.REGULAR; import static com.facebook.presto.iceberg.IcebergQueryRunner.createIcebergQueryRunner; -import static com.facebook.presto.iceberg.IcebergSessionProperties.HIVE_METASTORE_STATISTICS_MERGE_STRATEGY; -import static 
com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; -import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static com.facebook.presto.testing.assertions.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotEquals; @@ -71,7 +67,7 @@ public class TestIcebergHiveStatistics protected QueryRunner createQueryRunner() throws Exception { - return createIcebergQueryRunner(ImmutableMap.of(), ImmutableMap.of("iceberg.hive-statistics-merge-strategy", NUMBER_OF_DISTINCT_VALUES.name())); + return createIcebergQueryRunner(ImmutableMap.of(), ImmutableMap.of("iceberg.hive-statistics-merge-strategy", "USE_NDV")); } private static final Set NUMERIC_ORDERS_COLUMNS = ImmutableSet.builder() @@ -281,67 +277,6 @@ public void testStatsWithPartitionedTablesNoAnalyze() assertQuerySucceeds("DROP TABLE statsWithPartition"); } - @Test - public void testHiveStatisticsMergeFlags() - { - assertQuerySucceeds("CREATE TABLE mergeFlagsStats (i int, v varchar)"); - assertQuerySucceeds("INSERT INTO mergeFlagsStats VALUES (0, '1'), (1, '22'), (2, '333'), (NULL, 'aaaaa'), (4, NULL)"); - assertQuerySucceeds("ANALYZE mergeFlagsStats"); // stats stored in - // Test stats without merging doesn't return NDVs or data size - Session session = Session.builder(getSession()) - .setCatalogSessionProperty("iceberg", HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, "") - .build(); - TableStatistics stats = getTableStatistics(session, "mergeFlagsStats"); - Map columnStatistics = getColumnNameMap(stats); - assertEquals(columnStatistics.get("i").getDistinctValuesCount(), Estimate.unknown()); - assertEquals(columnStatistics.get("i").getDataSize(), Estimate.unknown()); - assertEquals(columnStatistics.get("v").getDistinctValuesCount(), Estimate.unknown()); - assertEquals(columnStatistics.get("v").getDataSize(), Estimate.unknown()); - - // Test stats merging for NDVs - session = 
Session.builder(getSession()) - .setCatalogSessionProperty("iceberg", HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, NUMBER_OF_DISTINCT_VALUES.name()) - .build(); - stats = getTableStatistics(session, "mergeFlagsStats"); - columnStatistics = getColumnNameMap(stats); - assertEquals(columnStatistics.get("i").getDistinctValuesCount(), Estimate.of(4.0)); - assertEquals(columnStatistics.get("i").getDataSize(), Estimate.unknown()); - assertEquals(columnStatistics.get("v").getDistinctValuesCount(), Estimate.of(4.0)); - assertEquals(columnStatistics.get("v").getDataSize(), Estimate.unknown()); - - // Test stats for data size - session = Session.builder(getSession()) - .setCatalogSessionProperty("iceberg", HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, TOTAL_SIZE_IN_BYTES.name()) - .build(); - stats = getTableStatistics(session, "mergeFlagsStats"); - columnStatistics = getColumnNameMap(stats); - assertEquals(columnStatistics.get("i").getDistinctValuesCount(), Estimate.unknown()); - assertEquals(columnStatistics.get("i").getDataSize(), Estimate.unknown()); // fixed-width isn't collected - assertEquals(columnStatistics.get("v").getDistinctValuesCount(), Estimate.unknown()); - assertEquals(columnStatistics.get("v").getDataSize(), Estimate.of(11)); - - // Test stats for both - session = Session.builder(getSession()) - .setCatalogSessionProperty("iceberg", HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, NUMBER_OF_DISTINCT_VALUES.name() + "," + TOTAL_SIZE_IN_BYTES) - .build(); - stats = getTableStatistics(session, "mergeFlagsStats"); - columnStatistics = getColumnNameMap(stats); - assertEquals(columnStatistics.get("i").getDistinctValuesCount(), Estimate.of(4.0)); - assertEquals(columnStatistics.get("i").getDataSize(), Estimate.unknown()); - assertEquals(columnStatistics.get("v").getDistinctValuesCount(), Estimate.of(4.0)); - assertEquals(columnStatistics.get("v").getDataSize(), Estimate.of(11)); - } - - private TableStatistics getTableStatistics(Session session, String table) - { - Metadata meta 
= getQueryRunner().getMetadata(); - TransactionId txid = getQueryRunner().getTransactionManager().beginTransaction(false); - Session txnSession = session.beginTransactionId(txid, getQueryRunner().getTransactionManager(), new AllowAllAccessControl()); - Map columnHandles = getColumnHandles(table, txnSession); - List columnHandleList = new ArrayList<>(columnHandles.values()); - return meta.getTableStatistics(txnSession, getAnalyzeTableHandle(table, txnSession), columnHandleList, Constraint.alwaysTrue()); - } - private void columnStatsEqual(Map actualStats, Map expectedStats) { for (ColumnHandle handle : expectedStats.keySet()) { @@ -415,13 +350,6 @@ static void assertStatValue(StatsSchema column, MaterializedResult result, Set getColumnNameMap(TableStatistics statistics) - { - return statistics.getColumnStatistics().entrySet().stream().collect(Collectors.toMap(e -> - ((IcebergColumnHandle) e.getKey()).getName(), - Map.Entry::getValue)); - } - static void assertNDVsPresent(TableStatistics stats) { for (Map.Entry entry : stats.getColumnStatistics().entrySet()) { diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index f1679c77eea61..6c9e64e795118 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -74,7 +74,6 @@ import java.util.Objects; import java.util.Optional; import java.util.Set; -import java.util.stream.Collectors; import static com.facebook.presto.common.type.DateType.DATE; import static com.facebook.presto.common.type.SqlTimestamp.MICROSECONDS_PER_MILLISECOND; @@ -205,15 +204,10 @@ private Node rewriteShowStats(ShowStats node, Table table, Constraint columns.contains(entry.getKey())) .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); } - TableMetadata tableMetadata = 
metadata.getTableMetadata(session, tableHandle); - List nonHiddenColumns = ImmutableList.copyOf(tableMetadata.getColumns().stream().filter(column -> !column.isHidden()) - .map(ColumnMetadata::getName) - .map(columnHandles::get) - .filter(Objects::nonNull) - .collect(Collectors.toList())); - TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, nonHiddenColumns, constraint); + TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, ImmutableList.copyOf(columnHandles.values()), constraint); List statsColumnNames = buildColumnsNames(); List selectItems = buildSelectItems(statsColumnNames); + TableMetadata tableMetadata = metadata.getTableMetadata(session, tableHandle); List resultRows = buildStatisticsRows(tableMetadata, columnHandles, tableStatistics); return simpleQuery(selectAll(selectItems), diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java index 3e3c7c74609e1..93e72d3dcf64c 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/TableStatistics.java @@ -17,7 +17,6 @@ import com.facebook.presto.spi.ColumnHandle; import com.fasterxml.jackson.annotation.JsonProperty; -import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; import java.util.Objects; @@ -118,11 +117,6 @@ public Builder setRowCount(Estimate rowCount) return this; } - public Estimate getRowCount() - { - return rowCount; - } - public Builder setTotalSize(Estimate totalSize) { this.totalSize = requireNonNull(totalSize, "totalSize can not be null"); @@ -137,11 +131,6 @@ public Builder setColumnStatistics(ColumnHandle columnHandle, ColumnStatistics c return this; } - public Map getColumnStatistics() - { - return Collections.unmodifiableMap(columnStatisticsMap); - } - public TableStatistics 
build() { return new TableStatistics(rowCount, totalSize, columnStatisticsMap); From 48c8a944f7b8cd56fe9ea0d8d75a2cb004aaf4f6 Mon Sep 17 00:00:00 2001 From: feilong-liu Date: Fri, 3 May 2024 10:36:58 -0700 Subject: [PATCH 3/4] Revert "[Iceberg] Add predicate in layout without pushdown_filter_enabled" This reverts commit b2d45321d62bcd8e19cefa131dacc82dcbe59a6a. --- .../iceberg/IcebergAbstractMetadata.java | 73 ++++++++++--------- .../presto/iceberg/TableStatisticsMaker.java | 4 - .../iceberg/IcebergDistributedTestBase.java | 60 +-------------- .../hive/TestIcebergDistributedHive.java | 6 -- .../presto/sql/rewrite/ShowStatsRewrite.java | 1 - .../spi/statistics/ColumnStatistics.java | 15 ---- 6 files changed, 41 insertions(+), 118 deletions(-) diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java index e1b140588f023..01e1b161ed4c7 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java @@ -103,6 +103,7 @@ import java.util.stream.Collectors; import static com.facebook.presto.expressions.LogicalRowExpressions.TRUE_CONSTANT; +import static com.facebook.presto.hive.MetadataUtils.createPredicate; import static com.facebook.presto.hive.MetadataUtils.getCombinedRemainingPredicate; import static com.facebook.presto.hive.MetadataUtils.getDiscretePredicates; import static com.facebook.presto.hive.MetadataUtils.getPredicate; @@ -215,14 +216,13 @@ public ConnectorTableLayoutResult getTableLayoutForConstraint( IcebergTableHandle handle = (IcebergTableHandle) table; Table icebergTable = getIcebergTable(session, handle.getSchemaTableName()); - List partitionColumns = getPartitionKeyColumnHandles(icebergTable, typeManager); - TupleDomain partitionColumnPredicate = 
TupleDomain.withColumnDomains(Maps.filterKeys(constraint.getSummary().getDomains().get(), Predicates.in(partitionColumns))); + TupleDomain partitionColumnPredicate = TupleDomain.withColumnDomains(Maps.filterKeys(constraint.getSummary().getDomains().get(), Predicates.in(getPartitionKeyColumnHandles(icebergTable, typeManager)))); Optional> requestedColumns = desiredColumns.map(columns -> columns.stream().map(column -> (IcebergColumnHandle) column).collect(toImmutableSet())); ConnectorTableLayout layout = getTableLayout( session, new IcebergTableLayoutHandle.Builder() - .setPartitionColumns(ImmutableList.copyOf(partitionColumns)) + .setPartitionColumns(ImmutableList.copyOf(getPartitionKeyColumnHandles(icebergTable, typeManager))) .setDataColumns(toHiveColumns(icebergTable.schema().columns())) .setDomainPredicate(constraint.getSummary().transform(IcebergAbstractMetadata::toSubfield)) .setRemainingPredicate(TRUE_CONSTANT) @@ -258,52 +258,59 @@ public ConnectorTableLayout getTableLayout(ConnectorSession session, ConnectorTa Table icebergTable = getIcebergTable(session, tableHandle.getSchemaTableName()); validateTableMode(session, icebergTable); - List partitionColumns = ImmutableList.copyOf(icebergTableLayoutHandle.getPartitionColumns()); + if (!isPushdownFilterEnabled(session)) { + return new ConnectorTableLayout(handle); + } + + if (!icebergTableLayoutHandle.getPartitions().isPresent()) { return new ConnectorTableLayout( icebergTableLayoutHandle, Optional.empty(), - icebergTableLayoutHandle.getPartitionColumnPredicate(), + TupleDomain.none(), Optional.empty(), Optional.empty(), Optional.empty(), ImmutableList.of(), Optional.empty()); } - Optional> partitions = icebergTableLayoutHandle.getPartitions(); - Optional discretePredicates = partitions.flatMap(parts -> getDiscretePredicates(partitionColumns, parts)); + List partitionColumns = ImmutableList.copyOf(icebergTableLayoutHandle.getPartitionColumns()); + List partitions = icebergTableLayoutHandle.getPartitions().get(); 
+ + Optional discretePredicates = getDiscretePredicates(partitionColumns, partitions); + + TupleDomain predicate; + RowExpression subfieldPredicate; + if (isPushdownFilterEnabled(session)) { + Map predicateColumns = icebergTableLayoutHandle.getPredicateColumns().entrySet() + .stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - Map predicateColumns = icebergTableLayoutHandle.getPredicateColumns().entrySet() - .stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - Optional> predicate = partitions.map(parts -> getPredicate(icebergTableLayoutHandle, partitionColumns, parts, predicateColumns)); - // capture subfields from domainPredicate to add to remainingPredicate - // so those filters don't get lost - Map columnTypes = getColumns(icebergTable.schema(), icebergTable.spec(), typeManager).stream() - .collect(toImmutableMap(IcebergColumnHandle::getName, icebergColumnHandle -> getColumnMetadata(session, tableHandle, icebergColumnHandle).getType())); + predicate = getPredicate(icebergTableLayoutHandle, partitionColumns, partitions, predicateColumns); - RowExpression subfieldPredicate = getSubfieldPredicate(session, icebergTableLayoutHandle, columnTypes, functionResolution, rowExpressionService); + // capture subfields from domainPredicate to add to remainingPredicate + // so those filters don't get lost + Map columnTypes = getColumns(icebergTable.schema(), icebergTable.spec(), typeManager).stream() + .collect(toImmutableMap(IcebergColumnHandle::getName, icebergColumnHandle -> getColumnMetadata(session, tableHandle, icebergColumnHandle).getType())); + + subfieldPredicate = getSubfieldPredicate(session, icebergTableLayoutHandle, columnTypes, functionResolution, rowExpressionService); + } + else { + predicate = createPredicate(partitionColumns, partitions); + subfieldPredicate = TRUE_CONSTANT; + } // combine subfieldPredicate with remainingPredicate RowExpression combinedRemainingPredicate = 
getCombinedRemainingPredicate(icebergTableLayoutHandle, subfieldPredicate); - return predicate.map(pred -> new ConnectorTableLayout( - icebergTableLayoutHandle, - Optional.empty(), - pred, - Optional.empty(), - Optional.empty(), - discretePredicates, - ImmutableList.of(), - Optional.of(combinedRemainingPredicate))) - .orElseGet(() -> new ConnectorTableLayout( - icebergTableLayoutHandle, - Optional.empty(), - TupleDomain.none(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - ImmutableList.of(), - Optional.empty())); + return new ConnectorTableLayout( + icebergTableLayoutHandle, + Optional.empty(), + predicate, + Optional.empty(), + Optional.empty(), + discretePredicates, + ImmutableList.of(), + Optional.of(combinedRemainingPredicate)); } protected Optional getIcebergSystemTable(SchemaTableName tableName, Table table) diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java index 66c325a5df7db..a7c26a6694456 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java @@ -158,10 +158,6 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Cons .setRowCount(Estimate.of(0)) .build(); } - // the total record count for the whole table - Optional totalRecordCount = Optional.of(intersection) - .filter(domain -> !domain.isAll()) - .map(domain -> getDataTableSummary(tableHandle, ImmutableList.of(), TupleDomain.all(), idToTypeMapping, nonPartitionPrimitiveColumns, partitionFields).getRecordCount()); double recordCount = summary.getRecordCount(); TableStatistics.Builder result = TableStatistics.builder(); diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java 
index 258bfefc16065..7ab2a1a411208 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java @@ -40,7 +40,6 @@ import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.testing.MaterializedResult; -import com.facebook.presto.testing.MaterializedRow; import com.facebook.presto.testing.QueryRunner; import com.facebook.presto.tests.AbstractTestDistributedQueries; import com.google.common.collect.ImmutableList; @@ -117,7 +116,7 @@ import static org.testng.Assert.assertTrue; @Test(singleThreaded = true) -public abstract class IcebergDistributedTestBase +public class IcebergDistributedTestBase extends AbstractTestDistributedQueries { private final CatalogType catalogType; @@ -1201,63 +1200,6 @@ public void testEqualityDeletesWithHiddenPartitionsEvolution(String fileFormat, assertQuery(session, "SELECT * FROM " + tableName, "VALUES (1, '1001', NULL, NULL), (3, '1003', NULL, NULL), (6, '1004', 1, NULL), (6, '1006', 2, 'th002')"); } - @Test - public void testPartShowStatsWithFilters() - { - assertQuerySucceeds("CREATE TABLE showstatsfilters (i int) WITH (partitioning = ARRAY['i'])"); - assertQuerySucceeds("INSERT INTO showstatsfilters VALUES 1, 2, 3, 4, 5, 6, 7, 7, 7, 7"); - assertQuerySucceeds("ANALYZE showstatsfilters"); - - MaterializedResult statsTable = getQueryRunner().execute("SHOW STATS for showstatsfilters"); - MaterializedRow columnStats = statsTable.getMaterializedRows().get(0); - assertEquals(columnStats.getField(2), 7.0); // ndvs; - assertEquals(columnStats.getField(3), 0.0); // nulls - assertEquals(columnStats.getField(5), "1"); // min - assertEquals(columnStats.getField(6), "7"); // max - - // EQ - statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i = 7)"); - columnStats = 
statsTable.getMaterializedRows().get(0); - assertEquals(columnStats.getField(5), "7"); // min - assertEquals(columnStats.getField(6), "7"); // max - assertEquals(columnStats.getField(3), 0.0); // nulls - assertEquals((double) columnStats.getField(2), 7.0d * (4.0d / 10.0d), 1E-8); // ndvs; - - // LT - statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i < 7)"); - columnStats = statsTable.getMaterializedRows().get(0); - assertEquals(columnStats.getField(5), "1"); // min - assertEquals(columnStats.getField(6), "6"); // max - assertEquals(columnStats.getField(3), 0.0); // nulls - assertEquals((double) columnStats.getField(2), 7.0d * (6.0d / 10.0d), 1E-8); // ndvs; - - // LTE - statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i <= 7)"); - columnStats = statsTable.getMaterializedRows().get(0); - assertEquals(columnStats.getField(5), "1"); // min - assertEquals(columnStats.getField(6), "7"); // max - assertEquals(columnStats.getField(3), 0.0); // nulls - assertEquals(columnStats.getField(2), 7.0d); // ndvs; - - // GT - statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i > 7)"); - columnStats = statsTable.getMaterializedRows().get(0); - assertEquals(columnStats.getField(5), null); // min - assertEquals(columnStats.getField(6), null); // max - assertEquals(columnStats.getField(3), null); // nulls - assertEquals(columnStats.getField(2), null); // ndvs; - - // GTE - statsTable = getQueryRunner().execute("SHOW STATS for (SELECT * FROM showstatsfilters WHERE i >= 7)"); - columnStats = statsTable.getMaterializedRows().get(0); - assertEquals(columnStats.getField(5), "7"); // min - assertEquals(columnStats.getField(6), "7"); // max - assertEquals(columnStats.getField(3), 0.0); // nulls - assertEquals((double) columnStats.getField(2), 7.0d * (4.0d / 10.0d), 1E-8); // ndvs; - - assertQuerySucceeds("DROP TABLE showstatsfilters"); - } - private void 
testCheckDeleteFiles(Table icebergTable, int expectedSize, List expectedFileContent) { // check delete file list diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java index 8a7ef1a5fb560..1945e1122b8b0 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergDistributedHive.java @@ -49,12 +49,6 @@ public void testStatsByDistance() // ignore because HMS doesn't support statistics versioning } - @Override - public void testPartShowStatsWithFilters() - { - // Hive doesn't support returning statistics on partitioned tables - } - @Override protected Table loadTable(String tableName) { diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index 6c9e64e795118..60f0b01b662aa 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -71,7 +71,6 @@ import java.time.LocalDate; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Optional; import java.util.Set; diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index 8ae5cfc25ce41..f36bffa32e916 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -139,33 +139,18 @@ public Builder setNullsFraction(Estimate nullsFraction) return this; } - public Estimate getNullsFraction() - { - return nullsFraction; - } - public Builder 
setDistinctValuesCount(Estimate distinctValuesCount) { this.distinctValuesCount = requireNonNull(distinctValuesCount, "distinctValuesCount is null"); return this; } - public Estimate getDistinctValuesCount() - { - return distinctValuesCount; - } - public Builder setDataSize(Estimate dataSize) { this.dataSize = requireNonNull(dataSize, "dataSize is null"); return this; } - public Estimate getDataSize() - { - return dataSize; - } - public Builder setRange(DoubleRange range) { this.range = Optional.of(requireNonNull(range, "range is null")); From a07d5a46155186a12cad68c7599379cfeca88bed Mon Sep 17 00:00:00 2001 From: feilong-liu Date: Fri, 3 May 2024 11:12:48 -0700 Subject: [PATCH 4/4] Fix testQuickStats in TestParquetDistributedQueries Fix this test as part of revert PR https://github.com/prestodb/presto/pull/22661/ I chose to fix forward because the original PR is mainly for flaky test fix and has light dependency on the reverted PR hence fix forward is a better option. --- .../hive/TestParquetDistributedQueries.java | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java index e2e3cab39fa31..58ad85ad2b7d4 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestParquetDistributedQueries.java @@ -131,29 +131,29 @@ public void testQuickStatsPartitionedTable() // Since no stats were collected during write, only the partitioned columns will have stats assertQuery("SHOW STATS FOR test_quick_stats_partitioned", "SELECT * FROM (VALUES " + - " ('suppkey', null, null, null, null, null, null, null), " + - " ('linenumber', null, null, null, null, null, null, null), " + - " ('orderkey', null, 10.0, 0.0, null, 100, 109, null), " + - " ('partkey', null, 10.0, 0.0, null, 1000,
1009, null), " + - " (null, null, null, null, 10.0, null, null, null))"); + " ('suppkey', null, null, null, null, null, null), " + + " ('linenumber', null, null, null, null, null, null), " + + " ('orderkey', null, 10.0, 0.0, null, 100, 109), " + + " ('partkey', null, 10.0, 0.0, null, 1000, 1009), " + + " (null, null, null, null, 10.0, null, null))"); // With quick stats enabled, we should get nulls_fraction, low_value and high_value for all columns assertQuery(session, "SHOW STATS FOR test_quick_stats_partitioned", "SELECT * FROM (VALUES " + - " ('suppkey', null, null, 0.0, null, 1, 10, null), " + - " ('linenumber', null, null, 0.0, null, 1, 10, null), " + - " ('orderkey', null, 10.0, 0.0, null, 100, 109, null), " + - " ('partkey', null, 10.0, 0.0, null, 1000, 1009, null), " + - " (null, null, null, null, 10.0, null, null, null))"); + " ('suppkey', null, null, 0.0, null, 1, 10), " + + " ('linenumber', null, null, 0.0, null, 1, 10), " + + " ('orderkey', null, 10.0, 0.0, null, 100, 109), " + + " ('partkey', null, 10.0, 0.0, null, 1000, 1009), " + + " (null, null, null, null, 10.0, null, null))"); // If a query targets a specific partition, stats are correctly limited to that partition assertQuery(session, "show stats for (select * from test_quick_stats_partitioned where partkey = 1009)", "SELECT * FROM (VALUES " + - " ('suppkey', null, null, 0.0, null, 10, 10, null), " + - " ('linenumber', null, null, 0.0, null, 10, 10, null), " + - " ('orderkey', null, 1.0, 0.0, null, 109, 109, null), " + - " ('partkey', null, 1.0, 0.0, null, 1009, 1009, null), " + - " (null, null, null, null, 1.0, null, null, null))"); + " ('suppkey', null, null, 0.0, null, 10, 10), " + + " ('linenumber', null, null, 0.0, null, 10, 10), " + + " ('orderkey', null, 1.0, 0.0, null, 109, 109), " + + " ('partkey', null, 1.0, 0.0, null, 1009, 1009), " + + " (null, null, null, null, 1.0, null, null))"); } finally { getQueryRunner().execute("DROP TABLE test_quick_stats_partitioned");